In [1]:
import pandas as pd #Data Manipulation
import numpy as np #Data Manipulation

import matplotlib.pyplot as plt #Plotting
import seaborn as sns #Plotting
sns.set(style='white')

from sklearn import preprocessing #Preprocessing

from scipy.stats import skew, boxcox_normmax #Preprocessing
from scipy.special import boxcox1p #Preprocessing

from sklearn.model_selection import train_test_split #Train/Test Split
from sklearn.linear_model import LogisticRegression #Model

from sklearn.metrics import classification_report #Metrics
from sklearn.metrics import confusion_matrix #Metrics
from sklearn.metrics import accuracy_score #Metrics
from sklearn.metrics import roc_auc_score, roc_curve #ROC
from sklearn import model_selection #Cross Validation
from sklearn.feature_selection import RFE, RFECV #Feature Selection

Data Loading

In [2]:
# Load the HR turnover dataset.
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
hr = pd.read_csv('C:\\Users\\RIA SHARMA\\Desktop\\data\\data\\turnover.csv')
hr.head()
Out[2]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years sales salary
0 0.38 0.53 2 157 3 0 1 0 sales low
1 0.80 0.86 5 262 6 0 1 0 sales medium
2 0.11 0.88 7 272 4 0 1 0 sales medium
3 0.72 0.87 5 223 5 0 1 0 sales low
4 0.37 0.52 2 159 3 0 1 0 sales low

Data Preparation

Variables Types and Definitions

In [3]:
# Shape of the data frame
n_rows, n_cols = hr.shape
print('Rows:', n_rows, '| Columns:', n_cols)
Rows: 14999 | Columns: 10
In [4]:
# Describe each variable
def df_desc(df):
    """Summarize each column of ``df``: dtype, NA count, and a rough
    type classification (Numerical / Boolean / Categorical).

    A column is flagged Boolean when every value is 0 or 1, Categorical
    when its dtype is object, and Numerical otherwise (non-object,
    non-boolean).
    """
    # Single pass: all-values-in-{0,1} is equivalent to the original
    # count(==0) + count(==1) == len(df) check, but avoids four separate
    # full-frame applies.  The redundant function-local `import pandas`
    # was dropped — pandas is imported at the top of the file.
    is_boolean = df.isin([0, 1]).all()
    is_categorical = df.dtypes == 'object'
    desc = pd.DataFrame({'dtype': df.dtypes,
                         'NAs': df.isna().sum(),
                         'Numerical': (~is_categorical) & (~is_boolean),
                         'Boolean': is_boolean,
                         'Categorical': is_categorical,
                        })
    return desc

# Column-level summary of the raw frame.
df_desc(hr)
Out[4]:
dtype NAs Numerical Boolean Categorical
satisfaction_level float64 0 True False False
last_evaluation float64 0 True False False
number_project int64 0 True False False
average_montly_hours int64 0 True False False
time_spend_company int64 0 True False False
Work_accident int64 0 False True False
left int64 0 False True False
promotion_last_5years int64 0 False True False
sales object 0 False False True
salary object 0 False False True
In [5]:
# Summarize numerical variables
hr.describe()
Out[5]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years
count 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000
mean 0.612834 0.716102 3.803054 201.050337 3.498233 0.144610 0.238083 0.021268
std 0.248631 0.171169 1.232592 49.943099 1.460136 0.351719 0.425924 0.144281
min 0.090000 0.360000 2.000000 96.000000 2.000000 0.000000 0.000000 0.000000
25% 0.440000 0.560000 3.000000 156.000000 3.000000 0.000000 0.000000 0.000000
50% 0.640000 0.720000 4.000000 200.000000 3.000000 0.000000 0.000000 0.000000
75% 0.820000 0.870000 5.000000 245.000000 4.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000
In [6]:
# Lists values of categorical variables
categories = {col: hr[col].unique().tolist() for col in ('sales', 'salary')}
ordered_keys = sorted(categories)
for key in ordered_keys:
    print(key + ":")
    print(categories[key])
    # Blank separator between entries, but not after the last one.
    if key != ordered_keys[-1]:
        print("\n")
salary:
['low', 'medium', 'high']


sales:
['sales', 'accounting', 'hr', 'technical', 'support', 'management', 'IT', 'product_mng', 'marketing', 'RandD']
In [7]:
# Rename variable sales to the more accurate "department".
rename_map = {'sales': 'department'}
hr = hr.rename(index=str, columns=rename_map)

Exploratory Data Analysis

Target Proportion

In [8]:
# Count occurrences of each value in left (the target variable).
hr['left'].value_counts()
Out[8]:
0    11428
1     3571
Name: left, dtype: int64

23.8% of the employees listed in the dataset have left the company.

In [9]:
# Get the mean of each variable for the different values of left
# (per the output: leavers show lower satisfaction and fewer accidents).
hr.groupby('left').mean()
Out[9]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident promotion_last_5years
left
0 0.666810 0.715473 3.786664 199.060203 3.380032 0.175009 0.026251
1 0.440098 0.718113 3.855503 207.419210 3.876505 0.047326 0.005321

Correlation Analysis

In [10]:
# Correlation Matrix (non-numeric columns are dropped by .corr()).
plt.figure(figsize=(12,8))
sns.heatmap(hr.corr(), cmap='RdBu', annot=True)
plt.tight_layout()
In [11]:
# Pair Plot: histograms on the diagonal, scatter elsewhere, hued by left.
plot = sns.PairGrid(hr, hue='left', palette=('steelblue', 'crimson'))
plot = plot.map_diag(plt.hist)
plot = plot.map_offdiag(plt.scatter)
plot.add_legend()
plt.tight_layout()

Turnover by Salary Levels

In [12]:
# Salary Levels proportions and turnover rates
# (share of each salary band, then departure rate per band).
print('Salary Levels proportions')
print(hr['salary'].value_counts()/len(hr)*100)
print('\n')
print('Turnover Rate by Salary level')
print(hr.groupby('salary')['left'].mean())
Salary Levels proportions
low       48.776585
medium    42.976198
high       8.247216
Name: salary, dtype: float64


Turnover Rate by Salary level
salary
high      0.066289
low       0.296884
medium    0.204313
Name: left, dtype: float64

Turnover by Departments

In [13]:
# Departments proportions (percentage of headcount per department).
hr['department'].value_counts()/len(hr)*100
Out[13]:
sales          27.601840
technical      18.134542
support        14.860991
IT              8.180545
product_mng     6.013734
marketing       5.720381
RandD           5.247016
accounting      5.113674
hr              4.926995
management      4.200280
Name: department, dtype: float64
In [14]:
# Turnover Rate by Department, sorted descending.
hr.groupby('department')['left'].mean().sort_values(ascending=False).plot(kind='bar', color='steelblue')
plt.title('Departure Ratio by Department')
plt.xlabel('')
plt.tight_layout()

Turnover by Satisfaction Level

In [15]:
# Distribution of satisfaction_level (histogram + KDE), x-axis clipped
# to the observed range.
plt.figure(figsize=(15,5))
sns.distplot(hr.satisfaction_level,
             bins = 20,
             color = 'steelblue').axes.set_xlim(min(hr.satisfaction_level),max(hr.satisfaction_level))
plt.tight_layout()
C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
In [16]:
# Count of each satisfaction_level value, split by left.
plt.figure(figsize=(15,5))
sns.countplot(hr['satisfaction_level'],
              hue = hr['left'],
              palette = ('steelblue', 'crimson'))
plt.tight_layout()

Turnover by Last Evaluation

In [17]:
# Distribution of last_evaluation, x-axis clipped to the observed range.
plt.figure(figsize=(15,5))
sns.distplot(hr.last_evaluation,
             bins = 20,
             color = 'steelblue').axes.set_xlim(min(hr.last_evaluation),max(hr.last_evaluation))
plt.tight_layout()
In [18]:
# Count of each last_evaluation value, split by left.
plt.figure(figsize=(15,5))
sns.countplot(hr['last_evaluation'],
              hue = hr['left'],
              palette = ('steelblue', 'crimson'))
plt.tight_layout()

Turnover by Number of Projects

In [19]:
# Distribution of number_project, x-axis clipped to the observed range.
plt.figure(figsize=(15,5))
sns.distplot(hr.number_project,
             bins = 20,
             color = 'steelblue').axes.set_xlim(min(hr.number_project),max(hr.number_project))
plt.tight_layout()
In [20]:
# Count of each number_project value, split by left.
plt.figure(figsize=(15,5))
sns.countplot(hr['number_project'],
              hue = hr['left'],
              palette = ('steelblue', 'crimson'))
plt.tight_layout()

Turnover by Average Monthly Hours

In [21]:
# Distribution of average_montly_hours, x-axis clipped to observed range.
plt.figure(figsize=(15,5))
sns.distplot(hr.average_montly_hours,
             bins = 20,
             color = 'steelblue').axes.set_xlim(min(hr.average_montly_hours),max(hr.average_montly_hours))
plt.tight_layout()
In [22]:
# Count of each average_montly_hours value, split by left.
plt.figure(figsize=(15,5))
sns.countplot(hr['average_montly_hours'],
              hue = hr['left'],
              palette = ('steelblue', 'crimson'))
plt.tight_layout()

Turnover by Time Spent in the Company

In [23]:
# Count of each time_spend_company (tenure) value, split by left.
plt.figure(figsize=(15,5))
sns.countplot(hr['time_spend_company'],
              hue = hr['left'],
              palette = ('steelblue', 'crimson'))
plt.tight_layout()

Turnover by Work Accident

In [24]:
# Count of Work_accident values (0/1), split by left.
plt.figure(figsize=(15,5))
sns.countplot(hr['Work_accident'],
              hue = hr['left'],
              palette = ('steelblue', 'crimson'))
plt.tight_layout()

Turnover by Promotion within the past 5 years

In [25]:
# Count of promotion_last_5years values (0/1), split by left.
plt.figure(figsize=(15,5))
sns.countplot(hr['promotion_last_5years'],
              hue = hr['left'],
              palette = ('steelblue', 'crimson'))
plt.tight_layout()
In [26]:
# Turnover rate conditioned on promotion, computed with one grouped pass
# instead of four separate boolean-filter scans over the whole frame.
# Printed output is identical to the original implementation.
promo_rate = hr.groupby('promotion_last_5years')['left'].mean() * 100
print('Turnover Rate if Promotion:', round(promo_rate[1], 2), '%')
print('Turnover Rate if No Promotion:', round(promo_rate[0], 2), '%')
Turnover Rate if Promotion: 5.96 %
Turnover Rate if No Promotion: 24.2 %

Number of Projects vs Average Monthly Hours

In [27]:
# Mean-based bar plot of hours vs project count, split by left.
plt.figure(figsize=(15,5))
sns.barplot(x=hr.average_montly_hours,
            y=hr.number_project,
            hue=hr.left,
            palette = ('steelblue', 'crimson'))
plt.tight_layout()
In [28]:
# Scatter of hours vs project count, colored by left.
plt.figure(figsize=(15,5))
sns.scatterplot(x=hr.average_montly_hours,
            y=hr.number_project,
            hue=hr.left,
            palette = ('steelblue', 'crimson'))
plt.tight_layout()

Number of Projects vs Last Evaluation

In [29]:
# Mean-based bar plot of last evaluation vs project count, split by left.
plt.figure(figsize=(15,5))
sns.barplot(x=hr.last_evaluation,
            y=hr.number_project,
            hue=hr.left,
            palette = ('steelblue', 'crimson'))
plt.tight_layout()
In [30]:
# Scatter of last evaluation vs project count, colored by left.
plt.figure(figsize=(15,5))
sns.scatterplot(x=hr.last_evaluation,
            y=hr.number_project,
            hue=hr.left,
            palette = ('steelblue', 'crimson'))
plt.tight_layout()

Last Evaluation vs Average Monthly Hours

In [31]:
# Mean-based bar plot of hours vs last evaluation, split by left.
plt.figure(figsize=(15,5))
sns.barplot(x=hr.average_montly_hours,
            y=hr.last_evaluation,
            hue=hr.left,
            palette = ('steelblue', 'crimson'))
plt.tight_layout()
In [32]:
# Scatter of hours vs last evaluation, colored by left.
plt.figure(figsize=(15,5))
sns.scatterplot(x=hr.average_montly_hours,
            y=hr.last_evaluation,
            hue=hr.left,
            palette = ('steelblue', 'crimson'))
plt.tight_layout()

Last Evaluation vs Satisfaction Level

In [33]:
# Mean-based bar plot of satisfaction vs last evaluation, split by left.
plt.figure(figsize=(15,5))
sns.barplot(x=hr.satisfaction_level,
            y=hr.last_evaluation,
            hue=hr.left,
            palette = ('steelblue', 'crimson'))
plt.tight_layout()
In [34]:
# Scatter of satisfaction vs last evaluation, colored by left.
plt.figure(figsize=(15,5))
sns.scatterplot(x=hr.satisfaction_level,
            y=hr.last_evaluation,
            hue=hr.left,
            palette = ('steelblue', 'crimson'))
plt.tight_layout()

Encoding Categorical Variables

In [35]:
# Encoding the variable salary as an ordinal integer (low < medium < high).
salary_dict = {'low': 0, 'medium': 1, 'high': 2}
# pop() removes the original column and returns it; re-assigning the
# mapped values appends the encoded 'salary' as the LAST column, which
# reproduces the original map / drop / rename sequence exactly.
hr['salary'] = hr.pop('salary').map(salary_dict)
hr = hr.rename(index=str)
hr.head()
Out[35]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years department salary
0 0.38 0.53 2 157 3 0 1 0 sales 0
1 0.80 0.86 5 262 6 0 1 0 sales 1
2 0.11 0.88 7 272 4 0 1 0 sales 1
3 0.72 0.87 5 223 5 0 1 0 sales 0
4 0.37 0.52 2 159 3 0 1 0 sales 0
In [36]:
def numerical_features(df):
    """Return the names (an Index) of df's numeric columns.

    Note: relies on pandas' private-but-long-standing
    ``_get_numeric_data()`` shortcut, as the original did;
    ``df.select_dtypes(include='number').columns`` is the public
    equivalent.  The unused local ``columns`` binding was removed.
    """
    return df._get_numeric_data().columns

def categorical_features(df):
    """Return the (unordered) list of df's non-numeric column names."""
    non_numeric = set(df.columns) - set(numerical_features(df))
    return list(non_numeric)

def onehot_encode(df):
    """One-hot encode every categorical column of df.

    Numeric columns are carried over unchanged; each categorical column
    is replaced by dummy indicator columns prefixed with its name.
    """
    encoded = df.get(numerical_features(df)).copy()
    for cat_col in categorical_features(df):
        dummies = pd.get_dummies(df[cat_col], prefix=cat_col)
        encoded = pd.concat([encoded, dummies], axis=1)
    return encoded
In [37]:
# One-hot encode the remaining categorical column (department).
hr_encoded = onehot_encode(hr)
hr_encoded.head()
Out[37]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years salary department_IT department_RandD department_accounting department_hr department_management department_marketing department_product_mng department_sales department_support department_technical
0 0.38 0.53 2 157 3 0 1 0 0 0 0 0 0 0 0 0 1 0 0
1 0.80 0.86 5 262 6 0 1 0 1 0 0 0 0 0 0 0 1 0 0
2 0.11 0.88 7 272 4 0 1 0 1 0 0 0 0 0 0 0 1 0 0
3 0.72 0.87 5 223 5 0 1 0 0 0 0 0 0 0 0 0 1 0 0
4 0.37 0.52 2 159 3 0 1 0 0 0 0 0 0 0 0 0 1 0 0
In [38]:
# Re-check column types after encoding (dummies come back as uint8).
df_desc(hr_encoded)
Out[38]:
dtype NAs Numerical Boolean Categorical
satisfaction_level float64 0 True False False
last_evaluation float64 0 True False False
number_project int64 0 True False False
average_montly_hours int64 0 True False False
time_spend_company int64 0 True False False
Work_accident int64 0 False True False
left int64 0 False True False
promotion_last_5years int64 0 False True False
salary int64 0 True False False
department_IT uint8 0 False True False
department_RandD uint8 0 False True False
department_accounting uint8 0 False True False
department_hr uint8 0 False True False
department_management uint8 0 False True False
department_marketing uint8 0 False True False
department_product_mng uint8 0 False True False
department_sales uint8 0 False True False
department_support uint8 0 False True False
department_technical uint8 0 False True False

Scaling and Skewness

In [39]:
# Histograms of the three continuous features before scaling.
hr_encoded[['satisfaction_level',
           'last_evaluation',
           'average_montly_hours'
           ]].hist(bins = 20, figsize = (15,10), color = 'steelblue')
plt.tight_layout()
In [40]:
# Summary stats of the three continuous features before scaling.
hr_encoded[['satisfaction_level',
           'last_evaluation',
           'average_montly_hours'
           ]].describe()
Out[40]:
satisfaction_level last_evaluation average_montly_hours
count 14999.000000 14999.000000 14999.000000
mean 0.612834 0.716102 201.050337
std 0.248631 0.171169 49.943099
min 0.090000 0.360000 96.000000
25% 0.440000 0.560000 156.000000
50% 0.640000 0.720000 200.000000
75% 0.820000 0.870000 245.000000
max 1.000000 1.000000 310.000000
In [41]:
# Scale the three continuous features to [0, 1] with MinMaxScaler.
# The column list is named once instead of being repeated three times.
scaled_cols = ['satisfaction_level',
               'last_evaluation',
               'average_montly_hours']
scaler = preprocessing.MinMaxScaler()
hr_scaled_part = pd.DataFrame(scaler.fit_transform(hr_encoded[scaled_cols]),
                              columns=scaled_cols)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:323: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by MinMaxScaler.
  return self.partial_fit(X, y)
In [42]:
# Histograms after scaling — shapes are unchanged, only the range shifts.
hr_scaled_part[['satisfaction_level',
                'last_evaluation',
                'average_montly_hours']].hist(bins = 20, figsize = (15,10), color = 'steelblue')
plt.tight_layout()
In [43]:
# Confirm all three features now span [0, 1].
hr_scaled_part.describe()
Out[43]:
satisfaction_level last_evaluation average_montly_hours
count 14999.000000 14999.000000 14999.000000
mean 0.574542 0.556409 0.490889
std 0.273220 0.267452 0.233379
min 0.000000 0.000000 0.000000
25% 0.384615 0.312500 0.280374
50% 0.604396 0.562500 0.485981
75% 0.802198 0.796875 0.696262
max 1.000000 1.000000 1.000000
In [44]:
def feature_skewness(df):
    """Compute skewness for df's numeric columns.

    Only the dtypes listed below count as numeric here; uint8 (the
    dtype of the one-hot dummies) is not in the list, so indicator
    columns are skipped.

    Returns
    -------
    tuple
        (per-feature skew as a Series sorted descending,
         list of numeric feature names).
    """
    numeric_dtypes = ['int16', 'int32', 'int64',
                      'float16', 'float32', 'float64']
    numeric_features = [col for col in df.columns
                        if df[col].dtype in numeric_dtypes]

    feature_skew = df[numeric_features].apply(
        lambda x: skew(x)).sort_values(ascending=False)
    # The original also built an unused `skews` DataFrame here; removed.
    return feature_skew, numeric_features
In [45]:
def fix_skewness(df):
    """Box-Cox-transform every numeric column whose skewness exceeds 0.5.

    NOTE: mutates ``df`` in place AND returns it, so the caller's frame
    is transformed as a side effect (hr_scaled_part is modified when
    hr_skewed_part is created from it).
    """
    feature_skew, numeric_features = feature_skewness(df)
    high_skew = feature_skew[feature_skew > 0.5]

    for col in high_skew.index:
        # boxcox_normmax finds the lambda that best normalizes the
        # shifted (strictly positive) data; boxcox1p applies it.
        df[col] = boxcox1p(df[col], boxcox_normmax(df[col] + 1))

    # The original recomputed the skew table into unused locals here
    # (`skew_features`, `skews`); that dead code was removed.
    return df
In [46]:
# Apply the skew fix (mutates hr_scaled_part in place as a side effect).
hr_skewed_part = fix_skewness(hr_scaled_part)
In [47]:
# Histograms after the skew correction.
hr_skewed_part.hist(bins = 20, figsize = (15,10), color = 'steelblue')
plt.tight_layout()
In [48]:
# Summary stats after the skew correction.
hr_skewed_part.describe()
Out[48]:
satisfaction_level last_evaluation average_montly_hours
count 14999.000000 14999.000000 14999.000000
mean 0.574542 0.556409 0.490889
std 0.273220 0.267452 0.233379
min 0.000000 0.000000 0.000000
25% 0.384615 0.312500 0.280374
50% 0.604396 0.562500 0.485981
75% 0.802198 0.796875 0.696262
max 1.000000 1.000000 1.000000
In [49]:
# Recombine the skew-corrected continuous features with the rest of the
# encoded frame into the modeling frame hr_ready.
scaled_cols = ['satisfaction_level',
               'last_evaluation',
               'average_montly_hours']
hr_simple = hr_encoded.drop(scaled_cols, axis=1)

# Align both pieces on a fresh RangeIndex before concatenating
# column-wise (hr_skewed_part was rebuilt from an ndarray, so its index
# differs from hr_encoded's string index).
hr_simple = hr_simple.reset_index(drop=True)
hr_skewed_part = hr_skewed_part.reset_index(drop=True)

hr_ready = pd.concat([hr_skewed_part, hr_simple], axis=1, sort=False)

# Removed: a dead `hr_ready = pd.DataFrame()` initialization and two
# commented-out category-recoding lines from the original cell.
hr_ready.head()
Out[49]:
satisfaction_level last_evaluation average_montly_hours number_project time_spend_company Work_accident left promotion_last_5years salary department_IT department_RandD department_accounting department_hr department_management department_marketing department_product_mng department_sales department_support department_technical
0 0.318681 0.265625 0.285047 2 3 0 1 0 0 0 0 0 0 0 0 0 1 0 0
1 0.780220 0.781250 0.775701 5 6 0 1 0 1 0 0 0 0 0 0 0 1 0 0
2 0.021978 0.812500 0.822430 7 4 0 1 0 1 0 0 0 0 0 0 0 1 0 0
3 0.692308 0.796875 0.593458 5 5 0 1 0 0 0 0 0 0 0 0 0 1 0 0
4 0.307692 0.250000 0.294393 2 3 0 1 0 0 0 0 0 0 0 0 0 1 0 0
In [50]:
# Final column-type check on the modeling frame.
df_desc(hr_ready)
Out[50]:
dtype NAs Numerical Boolean Categorical
satisfaction_level float64 0 True False False
last_evaluation float64 0 True False False
average_montly_hours float64 0 True False False
number_project int64 0 True False False
time_spend_company int64 0 True False False
Work_accident int64 0 False True False
left int64 0 False True False
promotion_last_5years int64 0 False True False
salary int64 0 True False False
department_IT uint8 0 False True False
department_RandD uint8 0 False True False
department_accounting uint8 0 False True False
department_hr uint8 0 False True False
department_management uint8 0 False True False
department_marketing uint8 0 False True False
department_product_mng uint8 0 False True False
department_sales uint8 0 False True False
department_support uint8 0 False True False
department_technical uint8 0 False True False
In [51]:
# Summary stats of the full modeling frame.
hr_ready.describe()
Out[51]:
satisfaction_level last_evaluation average_montly_hours number_project time_spend_company Work_accident left promotion_last_5years salary department_IT department_RandD department_accounting department_hr department_management department_marketing department_product_mng department_sales department_support department_technical
count 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000
mean 0.574542 0.556409 0.490889 3.803054 3.498233 0.144610 0.238083 0.021268 0.594706 0.081805 0.052470 0.051137 0.049270 0.042003 0.057204 0.060137 0.276018 0.148610 0.181345
std 0.273220 0.267452 0.233379 1.232592 1.460136 0.351719 0.425924 0.144281 0.637183 0.274077 0.222981 0.220284 0.216438 0.200602 0.232239 0.237749 0.447041 0.355715 0.385317
min 0.000000 0.000000 0.000000 2.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.384615 0.312500 0.280374 3.000000 3.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.604396 0.562500 0.485981 4.000000 3.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 0.802198 0.796875 0.696262 5.000000 4.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 7.000000 10.000000 1.000000 1.000000 1.000000 2.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [52]:
# Histograms of every column in the modeling frame.
hr_ready.hist(bins = 20, figsize = (15,10), color = 'steelblue')
plt.tight_layout()

Training/Test Split

In [53]:
# Split configuration: target column, hold-out fraction, RNG seed.
target = 'left'

split_ratio = 0.3
seed = 806

def split_dataset(df, target, split_ratio=0.3, seed=806):
    """Split ``df`` into features/target and train/test partitions.

    Returns (X, y, X_train, X_test, y_train, y_test); ``y`` is kept as
    a one-column DataFrame.
    """
    # Every column except the target is a feature (original order kept).
    feature_cols = [col for col in df.columns if col != target]

    X = df[feature_cols]
    y = df[[target]]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split_ratio, random_state=seed)

    return X, y, X_train, X_test, y_train, y_test

# Perform the split and report the resulting shapes.
X, y, X_train, X_test, y_train, y_test = split_dataset(hr_ready, target, split_ratio, seed)

print('Features:',X.shape[0], 'items | ', X.shape[1],'columns')
print('Target:',y.shape[0], 'items | ', y.shape[1],'columns')
print('Features Train:',X_train.shape[0], 'items | ', X_train.shape[1],'columns')
print('Features Test:',X_test.shape[0], 'items | ', X_test.shape[1],'columns')
print('Target Train:',y_train.shape[0], 'items | ', y_train.shape[1],'columns')
print('Target Test:',y_test.shape[0], 'items | ', y_test.shape[1],'columns')
Features: 14999 items |  18 columns
Target: 14999 items |  1 columns
Features Train: 10499 items |  18 columns
Features Test: 4500 items |  18 columns
Target Train: 10499 items |  1 columns
Target Test: 4500 items |  1 columns

Baseline

In [54]:
# Baseline model: logistic regression (lbfgs, extra iterations for convergence).
lr = LogisticRegression(solver='lbfgs', max_iter = 300)
In [55]:
def lr_run(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split, then print test accuracy,
    a classification report, the confusion matrix, and the fitted
    coefficients with the intercept as the first row.

    The original bound ``model.fit``'s return value to an unused
    ``result`` variable; that binding was removed.
    """
    model.fit(X_train, y_train.values.ravel())

    y_pred = model.predict(X_test)
    acc_test = model.score(X_test, y_test)

    # Feature/Coef. table.  The loc[-1] / index+1 / sort_index shuffle
    # inserts the intercept at the top while keeping feature order.
    coefficients = pd.concat([pd.DataFrame(X_train.columns, columns=['Feature']),
                              pd.DataFrame(np.transpose(model.coef_), columns=['Coef.'])],
                             axis=1)
    coefficients.loc[-1] = ['intercept.', model.intercept_[0]]
    coefficients.index = coefficients.index + 1
    coefficients = coefficients.sort_index()

    print('Accuracy on test: {:.3f}'.format(acc_test))
    print()
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print()
    print(coefficients)
In [56]:
# Fit and evaluate the baseline on the raw feature set.
lr_run(lr, X_train, y_train, X_test, y_test)
Accuracy on test: 0.797

              precision    recall  f1-score   support

           0       0.82      0.94      0.88      3435
           1       0.63      0.34      0.44      1065

   micro avg       0.80      0.80      0.80      4500
   macro avg       0.73      0.64      0.66      4500
weighted avg       0.78      0.80      0.77      4500

Confusion Matrix:
[[3220  215]
 [ 700  365]]

                   Feature     Coef.
0               intercept.  0.652324
1       satisfaction_level -3.616903
2          last_evaluation  0.440213
3     average_montly_hours  0.910043
4           number_project -0.285365
5       time_spend_company  0.245420
6            Work_accident -1.394749
7    promotion_last_5years -1.189366
8                   salary -0.695795
9            department_IT -0.065209
10        department_RandD -0.474074
11   department_accounting  0.069996
12           department_hr  0.336692
13   department_management -0.352850
14    department_marketing  0.062129
15  department_product_mng  0.040316
16        department_sales  0.019113
17      department_support  0.230854
18    department_technical  0.147288
In [57]:
def plot_roc(model, X_test, y_test):
    """Plot the ROC curve for ``model`` on the test split.

    AUC in the legend is computed from hard predictions, while the
    curve itself uses predicted probabilities of the positive class.
    """
    logit_roc_auc = roc_auc_score(y_test, model.predict(X_test))
    fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
    plt.figure()
    plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend(loc="lower right")
    plt.show();
In [58]:
# ROC curve for the baseline model.
plot_roc(lr, X_test, y_test)

Feature Engineering

Cross Validation Strategy

In [59]:
def cv_acc (model, X_train, y_train, n_splits, seed):
    """Run k-fold cross-validated accuracy and print the mean plus each
    fold's score.
    """
    kfold = model_selection.KFold(n_splits=n_splits, random_state=seed)
    results = model_selection.cross_val_score(
        model, X_train, y_train.values.ravel(), cv=kfold, scoring='accuracy')

    # Bug fix: the header hardcoded "10-fold" regardless of n_splits;
    # report the actual fold count (output unchanged when n_splits=10).
    print("%d-fold cross validation average accuracy: %.3f" % (n_splits, results.mean()))
    print()
    for i, acc in enumerate(results, start=1):
        print('Iteration', '{:>2}'.format(i), '| Accuracy: {:.2f}'.format(acc))
In [60]:
# 10-fold CV accuracy of the baseline model.
cv_acc(lr, X_train, y_train, 10, seed)
10-fold cross validation average accuracy: 0.789

Iteration  1 | Accuracy: 0.79
Iteration  2 | Accuracy: 0.77
Iteration  3 | Accuracy: 0.78
Iteration  4 | Accuracy: 0.80
Iteration  5 | Accuracy: 0.81
Iteration  6 | Accuracy: 0.79
Iteration  7 | Accuracy: 0.79
Iteration  8 | Accuracy: 0.80
Iteration  9 | Accuracy: 0.79
Iteration 10 | Accuracy: 0.77

Features Construction

The dataset is copied to add or modify features.

In [61]:
# Work on a copy so feature engineering never mutates hr_ready.
hr_fe = hr_ready.copy()

Bin Satisfaction Level

Based on the EDA, we can bin the Satisfaction Level into 6 bins.

In [62]:
# Bin the (scaled / skew-corrected) satisfaction level into 6 bins.
# NOTE(review): bin edges are on the transformed scale while the labels
# read like original-scale intervals — presumably the labels map the
# transformed edges back to raw satisfaction values; verify.
bins = [-1, 0.03, 0.29, 0.41, 0.69, 0.92, 1]
labels=['(0.00, 0.11]','(0.11, 0.35]','(0.35, 0.46]','(0.46, 0.71]','(0.71, 0.92]','(0.92, 1.00]']
hr_fe['satisfaction_level_bin'] = pd.cut(hr_fe.satisfaction_level, bins, labels=labels)
hr_fe.satisfaction_level_bin.value_counts()
Out[62]:
(0.71, 0.92]    4765
(0.46, 0.71]    4689
(0.35, 0.46]    2012
(0.92, 1.00]    1362
(0.11, 0.35]    1283
(0.00, 0.11]     888
Name: satisfaction_level_bin, dtype: int64
In [63]:
# Visual check of the bin boundaries against the raw value counts.
plt.figure(figsize=(15,5))
sns.countplot(x=hr_fe.satisfaction_level,
              hue=hr_fe.satisfaction_level_bin,
              palette = sns.color_palette("hls", 6),
              dodge = False)
plt.tight_layout()
In [64]:
# Encode the new bin as dummies and drop the raw continuous column.
hr_fe_1 = hr_fe.copy()
hr_fe_1 = onehot_encode(hr_fe_1)
hr_fe_1.drop('satisfaction_level', inplace=True, axis=1)
In [65]:
# Re-split and re-evaluate with the binned satisfaction feature.
X_fe_1, y_fe_1, X_fe_1_train, X_fe_1_test, y_fe_1_train, y_fe_1_test = split_dataset(hr_fe_1, target, split_ratio, seed)
cv_acc(lr, X_fe_1_train, y_fe_1_train, 10, seed)
print()
lr_run(lr, X_fe_1_train, y_fe_1_train, X_fe_1_test, y_fe_1_test)
10-fold cross validation average accuracy: 0.916

Iteration  1 | Accuracy: 0.92
Iteration  2 | Accuracy: 0.92
Iteration  3 | Accuracy: 0.90
Iteration  4 | Accuracy: 0.91
Iteration  5 | Accuracy: 0.93
Iteration  6 | Accuracy: 0.92
Iteration  7 | Accuracy: 0.92
Iteration  8 | Accuracy: 0.92
Iteration  9 | Accuracy: 0.91
Iteration 10 | Accuracy: 0.91

Accuracy on test: 0.914

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      3435
           1       0.83      0.79      0.81      1065

   micro avg       0.91      0.91      0.91      4500
   macro avg       0.89      0.87      0.88      4500
weighted avg       0.91      0.91      0.91      4500

Confusion Matrix:
[[3266  169]
 [ 220  845]]

                                Feature     Coef.
0                            intercept. -4.097921
1                       last_evaluation  1.884625
2                  average_montly_hours  1.871843
3                        number_project -0.119089
4                    time_spend_company  0.433392
5                         Work_accident -1.199584
6                 promotion_last_5years -1.051113
7                                salary -0.727185
8                         department_IT -0.041412
9                      department_RandD -0.273373
10                department_accounting  0.043892
11                        department_hr  0.588687
12                department_management -0.686792
13                 department_marketing  0.034159
14               department_product_mng -0.082121
15                     department_sales -0.010748
16                   department_support  0.257522
17                 department_technical  0.199894
18  satisfaction_level_bin_(0.00, 0.11]  5.197687
19  satisfaction_level_bin_(0.11, 0.35] -1.583954
20  satisfaction_level_bin_(0.35, 0.46]  3.742319
21  satisfaction_level_bin_(0.46, 0.71] -2.637255
22  satisfaction_level_bin_(0.71, 0.92] -0.407616
23  satisfaction_level_bin_(0.92, 1.00] -4.281470

Bin Last Evaluation

Based on the EDA, we can bin the Last Evaluation into 4 bins.

In [66]:
# Bin the (transformed) last evaluation into 4 bins; same edge/label
# scale caveat as the satisfaction bins — verify the label mapping.
bins = [-1, 0.14, 0.34, 0.64, 1]
labels=['(0.00, 0.44]','(0.44, 0.57]','(0.57, 0.76]','(0.76, 1.00]']
hr_fe['last_evaluation_bin'] = pd.cut(hr_fe.last_evaluation, bins, labels=labels)
hr_fe_1['last_evaluation_bin'] = pd.cut(hr_fe_1.last_evaluation, bins, labels=labels)
hr_fe_1.last_evaluation_bin.value_counts()
Out[66]:
(0.76, 1.00]    6458
(0.57, 0.76]    4279
(0.44, 0.57]    3817
(0.00, 0.44]     445
Name: last_evaluation_bin, dtype: int64
In [67]:
# Visual check of the last_evaluation bin boundaries.
plt.figure(figsize=(15,5))
sns.countplot(x=hr_fe_1.last_evaluation,
              hue=hr_fe_1.last_evaluation_bin,
              palette = sns.color_palette("hls", 6),
              dodge = False)
plt.tight_layout()
In [68]:
# Encode the evaluation bin as dummies and drop the raw column.
hr_fe_2 = hr_fe_1.copy()
hr_fe_2 = onehot_encode(hr_fe_2)
hr_fe_2.drop('last_evaluation', inplace=True, axis=1)
In [69]:
# Re-split and re-evaluate with the binned evaluation feature.
X_fe_2, y_fe_2, X_fe_2_train, X_fe_2_test, y_fe_2_train, y_fe_2_test = split_dataset(hr_fe_2, target, split_ratio, seed)
cv_acc(lr, X_fe_2_train, y_fe_2_train, 10, seed)
print()
lr_run(lr, X_fe_2_train, y_fe_2_train, X_fe_2_test, y_fe_2_test)
10-fold cross validation average accuracy: 0.935

Iteration  1 | Accuracy: 0.93
Iteration  2 | Accuracy: 0.93
Iteration  3 | Accuracy: 0.93
Iteration  4 | Accuracy: 0.93
Iteration  5 | Accuracy: 0.94
Iteration  6 | Accuracy: 0.93
Iteration  7 | Accuracy: 0.95
Iteration  8 | Accuracy: 0.94
Iteration  9 | Accuracy: 0.93
Iteration 10 | Accuracy: 0.93

Accuracy on test: 0.936

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      3435
           1       0.88      0.84      0.86      1065

   micro avg       0.94      0.94      0.94      4500
   macro avg       0.92      0.90      0.91      4500
weighted avg       0.94      0.94      0.94      4500

Confusion Matrix:
[[3315  120]
 [ 167  898]]

                                Feature     Coef.
0                            intercept. -5.597721
1                  average_montly_hours  2.193237
2                        number_project  0.058676
3                    time_spend_company  0.463016
4                         Work_accident -1.172396
5                 promotion_last_5years -0.952911
6                                salary -0.723789
7                         department_IT -0.096328
8                      department_RandD -0.214964
9                 department_accounting  0.034619
10                        department_hr  0.619750
11                department_management -0.745848
12                 department_marketing  0.043118
13               department_product_mng -0.109391
14                     department_sales -0.001120
15                   department_support  0.228428
16                 department_technical  0.235004
17  satisfaction_level_bin_(0.00, 0.11]  4.804442
18  satisfaction_level_bin_(0.11, 0.35] -1.522668
19  satisfaction_level_bin_(0.35, 0.46]  3.611293
20  satisfaction_level_bin_(0.46, 0.71] -2.508970
21  satisfaction_level_bin_(0.71, 0.92] -0.263972
22  satisfaction_level_bin_(0.92, 1.00] -4.126858
23     last_evaluation_bin_(0.00, 0.44] -3.358159
24     last_evaluation_bin_(0.44, 0.57]  2.063212
25     last_evaluation_bin_(0.57, 0.76] -0.741659
26     last_evaluation_bin_(0.76, 1.00]  2.029875

Bin Average Monthly Hours

Based on the EDA, we can bin the Average Monthly Hours into 7 bins.

In [70]:
# 7 bins for the (scaled) average monthly hours; edges taken from the EDA.
# Labels show the original, unscaled hour ranges.
bins = [-1, 0.14, 0.165, 0.304, 0.565, 0.840, 0.897, 1]
labels = ['(0, 125]', '(125, 131]', '(131, 161]', '(161, 216]',
          '(216, 274]', '(274, 287]', '(287, 310]']
for frame in (hr_fe, hr_fe_2):
    frame['average_montly_hours_bin'] = pd.cut(frame.average_montly_hours, bins, labels=labels)
hr_fe_2.average_montly_hours_bin.value_counts()
Out[70]:
(216, 274]    5573
(161, 216]    4290
(131, 161]    3588
(0, 125]       486
(274, 287]     379
(125, 131]     353
(287, 310]     330
Name: average_montly_hours_bin, dtype: int64
In [71]:
plt.figure(figsize=(15, 5))
# Show how each (scaled) monthly-hours value maps onto its bin.
sns.countplot(
    data=hr_fe_2,
    x='average_montly_hours',
    hue='average_montly_hours_bin',
    palette=sns.color_palette("hls", 7),
    dodge=False,
)
plt.tight_layout()
In [72]:
# One-hot encode the new hours bin and drop the raw column it replaces.
# Non-inplace drop keeps the cell idempotent on re-run.
hr_fe_3 = onehot_encode(hr_fe_2.copy())
hr_fe_3 = hr_fe_3.drop(columns='average_montly_hours')
In [73]:
# Evaluate the model with binned monthly hours added:
# 10-fold CV on the training split, then a full train/test run.
# split_dataset / cv_acc / lr_run are defined earlier in the notebook.
X_fe_3, y_fe_3, X_fe_3_train, X_fe_3_test, y_fe_3_train, y_fe_3_test = split_dataset(hr_fe_3, target, split_ratio, seed)
cv_acc(lr, X_fe_3_train, y_fe_3_train, 10, seed)
print()
lr_run(lr, X_fe_3_train, y_fe_3_train, X_fe_3_test, y_fe_3_test)
10-fold cross validation average accuracy: 0.944

Iteration  1 | Accuracy: 0.95
Iteration  2 | Accuracy: 0.94
Iteration  3 | Accuracy: 0.94
Iteration  4 | Accuracy: 0.94
Iteration  5 | Accuracy: 0.95
Iteration  6 | Accuracy: 0.94
Iteration  7 | Accuracy: 0.95
Iteration  8 | Accuracy: 0.95
Iteration  9 | Accuracy: 0.94
Iteration 10 | Accuracy: 0.93

Accuracy on test: 0.945

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      3435
           1       0.91      0.86      0.88      1065

   micro avg       0.95      0.95      0.95      4500
   macro avg       0.93      0.92      0.92      4500
weighted avg       0.94      0.95      0.94      4500

Confusion Matrix:
[[3340   95]
 [ 151  914]]

                                Feature     Coef.
0                            intercept. -4.881973
1                        number_project  0.162148
2                    time_spend_company  0.452608
3                         Work_accident -1.155099
4                 promotion_last_5years -0.830394
5                                salary -0.709859
6                         department_IT -0.048903
7                      department_RandD -0.288485
8                 department_accounting  0.009802
9                         department_hr  0.540293
10                department_management -0.627491
11                 department_marketing -0.043924
12               department_product_mng -0.116596
13                     department_sales  0.026135
14                   department_support  0.265639
15                 department_technical  0.279711
16  satisfaction_level_bin_(0.00, 0.11]  4.670240
17  satisfaction_level_bin_(0.11, 0.35] -1.423270
18  satisfaction_level_bin_(0.35, 0.46]  3.392828
19  satisfaction_level_bin_(0.46, 0.71] -2.386602
20  satisfaction_level_bin_(0.71, 0.92] -0.190840
21  satisfaction_level_bin_(0.92, 1.00] -4.066175
22     last_evaluation_bin_(0.00, 0.44] -3.201445
23     last_evaluation_bin_(0.44, 0.57]  1.852081
24     last_evaluation_bin_(0.57, 0.76] -0.575362
25     last_evaluation_bin_(0.76, 1.00]  1.920907
26    average_montly_hours_bin_(0, 125] -4.210893
27  average_montly_hours_bin_(125, 131]  0.991261
28  average_montly_hours_bin_(131, 161]  0.340244
29  average_montly_hours_bin_(161, 216] -2.014307
30  average_montly_hours_bin_(216, 274]  0.638304
31  average_montly_hours_bin_(274, 287] -0.080350
32  average_montly_hours_bin_(287, 310]  4.331923

Categorize Number of Projects

Based on the EDA, the Number of Projects can be categorized into 4 categories.

In [74]:
# Map each project count to a workload category (bands from the EDA):
# 2 is unusually low, 3-5 normal, 6 high, 7 extreme.
categ = {2: 'too low', 6: 'too high', 7: 'extreme'}
categ.update({n: 'normal' for n in (3, 4, 5)})
for frame in (hr_fe, hr_fe_3):
    frame['number_project_cat'] = frame.number_project.map(categ)
hr_fe_3.number_project_cat.value_counts()
Out[74]:
normal      11181
too low      2388
too high     1174
extreme       256
Name: number_project_cat, dtype: int64
In [75]:
plt.figure(figsize=(15, 5))
# Show the category each project count was mapped to.
sns.countplot(
    data=hr_fe_3,
    x='number_project',
    hue='number_project_cat',
    palette=sns.color_palette("hls", 6),
    dodge=False,
)
plt.tight_layout()
In [76]:
# One-hot encode the project-count category and drop the raw column.
# Non-inplace drop keeps the cell idempotent on re-run.
hr_fe_4 = onehot_encode(hr_fe_3.copy())
hr_fe_4 = hr_fe_4.drop(columns='number_project')
In [77]:
# Evaluate the model with categorized project counts added:
# 10-fold CV on the training split, then a full train/test run.
# split_dataset / cv_acc / lr_run are defined earlier in the notebook.
X_fe_4, y_fe_4, X_fe_4_train, X_fe_4_test, y_fe_4_train, y_fe_4_test = split_dataset(hr_fe_4, target, split_ratio, seed)
cv_acc(lr, X_fe_4_train, y_fe_4_train, 10, seed)
print()
lr_run(lr, X_fe_4_train, y_fe_4_train, X_fe_4_test, y_fe_4_test)
10-fold cross validation average accuracy: 0.947

Iteration  1 | Accuracy: 0.94
Iteration  2 | Accuracy: 0.94
Iteration  3 | Accuracy: 0.94
Iteration  4 | Accuracy: 0.95
Iteration  5 | Accuracy: 0.96
Iteration  6 | Accuracy: 0.94
Iteration  7 | Accuracy: 0.96
Iteration  8 | Accuracy: 0.96
Iteration  9 | Accuracy: 0.94
Iteration 10 | Accuracy: 0.94

Accuracy on test: 0.950

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      3435
           1       0.90      0.88      0.89      1065

   micro avg       0.95      0.95      0.95      4500
   macro avg       0.93      0.93      0.93      4500
weighted avg       0.95      0.95      0.95      4500

Confusion Matrix:
[[3333  102]
 [ 125  940]]

                                Feature     Coef.
0                            intercept. -2.841180
1                    time_spend_company  0.507699
2                         Work_accident -1.201862
3                 promotion_last_5years -0.837989
4                                salary -0.709490
5                         department_IT -0.031500
6                      department_RandD -0.186930
7                 department_accounting  0.006741
8                         department_hr  0.623789
9                 department_management -0.707494
10                 department_marketing -0.097607
11               department_product_mng -0.202667
12                     department_sales  0.000056
13                   department_support  0.326933
14                 department_technical  0.270181
15  satisfaction_level_bin_(0.00, 0.11]  4.830750
16  satisfaction_level_bin_(0.11, 0.35] -1.270735
17  satisfaction_level_bin_(0.35, 0.46]  2.425329
18  satisfaction_level_bin_(0.46, 0.71] -2.250033
19  satisfaction_level_bin_(0.71, 0.92]  0.187877
20  satisfaction_level_bin_(0.92, 1.00] -3.921687
21     last_evaluation_bin_(0.00, 0.44] -2.975143
22     last_evaluation_bin_(0.44, 0.57]  1.472914
23     last_evaluation_bin_(0.57, 0.76] -0.497946
24     last_evaluation_bin_(0.76, 1.00]  2.001675
25    average_montly_hours_bin_(0, 125] -4.037054
26  average_montly_hours_bin_(125, 131]  0.708635
27  average_montly_hours_bin_(131, 161]  0.080379
28  average_montly_hours_bin_(161, 216] -1.803695
29  average_montly_hours_bin_(216, 274]  0.735678
30  average_montly_hours_bin_(274, 287] -0.077393
31  average_montly_hours_bin_(287, 310]  4.394951
32           number_project_cat_extreme  3.872547
33            number_project_cat_normal -2.153412
34          number_project_cat_too high -1.859039
35           number_project_cat_too low  0.141405

Categorize Time Spent in Company

Based on the EDA, the Time Spent in Company can be categorized into 4 categories, related to the rate of departure.

In [78]:
# Map years at the company to a departure-rate band (from the EDA):
# year 2 leaves rarely, 3/4/6 often, 5 very often, 7+ essentially never.
categ = {2: 'low departure', 5: 'very high departure'}
categ.update({n: 'high departure' for n in (3, 4, 6)})
categ.update({n: 'no departure' for n in (7, 8, 10)})
for frame in (hr_fe, hr_fe_4):
    frame['time_spend_company_cat'] = frame.time_spend_company.map(categ)
hr_fe_4.time_spend_company_cat.value_counts()
Out[78]:
high departure         9718
low departure          3244
very high departure    1473
no departure            564
Name: time_spend_company_cat, dtype: int64
In [79]:
plt.figure(figsize=(15, 5))
# Show the departure-rate band assigned to each tenure value.
sns.countplot(
    data=hr_fe_4,
    x='time_spend_company',
    hue='time_spend_company_cat',
    palette=sns.color_palette("hls", 7),
    dodge=False,
)
plt.tight_layout()
In [80]:
# One-hot encode the tenure category and drop the raw column.
# Non-inplace drop keeps the cell idempotent on re-run.
hr_fe_5 = onehot_encode(hr_fe_4.copy())
hr_fe_5 = hr_fe_5.drop(columns='time_spend_company')
In [81]:
# Evaluate the model with categorized tenure added:
# 10-fold CV on the training split, then a full train/test run.
# split_dataset / cv_acc / lr_run are defined earlier in the notebook.
X_fe_5, y_fe_5, X_fe_5_train, X_fe_5_test, y_fe_5_train, y_fe_5_test = split_dataset(hr_fe_5, target, split_ratio, seed)
cv_acc(lr, X_fe_5_train, y_fe_5_train, 10, seed)
print()
lr_run(lr, X_fe_5_train, y_fe_5_train, X_fe_5_test, y_fe_5_test)
10-fold cross validation average accuracy: 0.956

Iteration  1 | Accuracy: 0.95
Iteration  2 | Accuracy: 0.94
Iteration  3 | Accuracy: 0.95
Iteration  4 | Accuracy: 0.96
Iteration  5 | Accuracy: 0.96
Iteration  6 | Accuracy: 0.96
Iteration  7 | Accuracy: 0.96
Iteration  8 | Accuracy: 0.96
Iteration  9 | Accuracy: 0.96
Iteration 10 | Accuracy: 0.95

Accuracy on test: 0.956

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      3435
           1       0.93      0.88      0.91      1065

   micro avg       0.96      0.96      0.96      4500
   macro avg       0.95      0.93      0.94      4500
weighted avg       0.96      0.96      0.96      4500

Confusion Matrix:
[[3362   73]
 [ 124  941]]

                                       Feature     Coef.
0                                   intercept. -1.288513
1                                Work_accident -1.210856
2                        promotion_last_5years -0.454837
3                                       salary -0.672500
4                                department_IT -0.235474
5                             department_RandD -0.395298
6                        department_accounting -0.029671
7                                department_hr  0.510471
8                        department_management -0.297698
9                         department_marketing  0.143294
10                      department_product_mng -0.227719
11                            department_sales  0.001829
12                          department_support  0.350340
13                        department_technical  0.179556
14         satisfaction_level_bin_(0.00, 0.11]  5.056556
15         satisfaction_level_bin_(0.11, 0.35] -1.622557
16         satisfaction_level_bin_(0.35, 0.46]  2.196762
17         satisfaction_level_bin_(0.46, 0.71] -1.869052
18         satisfaction_level_bin_(0.71, 0.92]  0.005086
19         satisfaction_level_bin_(0.92, 1.00] -3.767165
20            last_evaluation_bin_(0.00, 0.44] -2.659269
21            last_evaluation_bin_(0.44, 0.57]  1.342359
22            last_evaluation_bin_(0.57, 0.76] -0.295577
23            last_evaluation_bin_(0.76, 1.00]  1.612117
24           average_montly_hours_bin_(0, 125] -4.064773
25         average_montly_hours_bin_(125, 131]  0.755401
26         average_montly_hours_bin_(131, 161]  0.213408
27         average_montly_hours_bin_(161, 216] -1.742236
28         average_montly_hours_bin_(216, 274]  0.596288
29         average_montly_hours_bin_(274, 287]  0.086931
30         average_montly_hours_bin_(287, 310]  4.154612
31                  number_project_cat_extreme  3.500939
32                   number_project_cat_normal -1.998452
33                 number_project_cat_too high -1.610600
34                  number_project_cat_too low  0.107742
35       time_spend_company_cat_high departure  0.357731
36        time_spend_company_cat_low departure -1.265409
37         time_spend_company_cat_no departure -2.014069
38  time_spend_company_cat_very high departure  2.921377

Cluster by Number of Projects and Average Monthly Hours

Based on the EDA, the employees can be clustered by Workload, based on the Number of Projects and Average Monthly Hours, into 5 categories.

In [82]:
def workload_cluster(row):
    """Label a row with a workload cluster.

    Combines the monthly-hours bin and the number of projects into one of
    five categories: 'very low', 'low', 'high', 'extreme' or 'normal'.
    Rules are checked in order, so the hours bin dominates at the extremes.
    """
    hours_bin = row['average_montly_hours_bin']
    n_projects = row['number_project']
    if hours_bin == '(0, 125]':
        return 'very low'
    if n_projects <= 2 and hours_bin in ('(125, 131]', '(131, 161]'):
        return 'low'
    if n_projects >= 4 and hours_bin in ('(216, 274]', '(274, 287]'):
        return 'high'
    if hours_bin == '(287, 310]':
        return 'extreme'
    return 'normal'

# Apply the cluster function row-wise; passing the function directly
# avoids the redundant `lambda row: f(row)` wrapper.
hr_fe['workload'] = hr_fe.apply(workload_cluster, axis=1)
hr_fe.workload.value_counts()
Out[82]:
normal      8265
high        4209
low         1709
very low     486
extreme      330
Name: workload, dtype: int64
In [83]:
plt.figure(figsize=(15, 5))
# Hours vs. projects, coloured by the workload cluster just created.
sns.scatterplot(
    data=hr_fe,
    x='average_montly_hours',
    y='number_project',
    hue='workload',
    palette=sns.color_palette("hls", 5),
)
plt.tight_layout()
In [84]:
# One-hot encode all engineered categoricals and drop the raw numeric
# columns they replace — one non-inplace drop instead of five
# repetitive inplace ones (idempotent on re-run).
hr_fe_6 = onehot_encode(hr_fe.copy())
hr_fe_6 = hr_fe_6.drop(columns=[
    'satisfaction_level',
    'last_evaluation',
    'average_montly_hours',
    'number_project',
    'time_spend_company',
])
In [85]:
# Evaluate the model with the workload cluster added:
# 10-fold CV on the training split, then a full train/test run.
# split_dataset / cv_acc / lr_run are defined earlier in the notebook.
X_fe_6, y_fe_6, X_fe_6_train, X_fe_6_test, y_fe_6_train, y_fe_6_test = split_dataset(hr_fe_6, target, split_ratio, seed)
cv_acc(lr, X_fe_6_train, y_fe_6_train, 10, seed)
print()
lr_run(lr, X_fe_6_train, y_fe_6_train, X_fe_6_test, y_fe_6_test)
10-fold cross validation average accuracy: 0.958

Iteration  1 | Accuracy: 0.95
Iteration  2 | Accuracy: 0.94
Iteration  3 | Accuracy: 0.95
Iteration  4 | Accuracy: 0.96
Iteration  5 | Accuracy: 0.97
Iteration  6 | Accuracy: 0.96
Iteration  7 | Accuracy: 0.96
Iteration  8 | Accuracy: 0.97
Iteration  9 | Accuracy: 0.96
Iteration 10 | Accuracy: 0.95

Accuracy on test: 0.959

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      3435
           1       0.94      0.88      0.91      1065

   micro avg       0.96      0.96      0.96      4500
   macro avg       0.95      0.93      0.94      4500
weighted avg       0.96      0.96      0.96      4500

Confusion Matrix:
[[3377   58]
 [ 125  940]]

                                       Feature     Coef.
0                                   intercept. -0.766901
1                                Work_accident -1.173201
2                        promotion_last_5years -0.439302
3                                       salary -0.662271
4                                department_IT -0.297132
5                             department_RandD -0.447797
6                        department_accounting  0.000741
7                                department_hr  0.458777
8                        department_management -0.164455
9                         department_marketing  0.048457
10                      department_product_mng -0.187570
11                            department_sales  0.034650
12                          department_support  0.347782
13                        department_technical  0.205697
14                  number_project_cat_extreme  3.487827
15                   number_project_cat_normal -1.632124
16                 number_project_cat_too high -1.306443
17                  number_project_cat_too low -0.550111
18           average_montly_hours_bin_(0, 125] -2.276070
19         average_montly_hours_bin_(125, 131]  0.579145
20         average_montly_hours_bin_(131, 161]  0.135179
21         average_montly_hours_bin_(161, 216] -0.624238
22         average_montly_hours_bin_(216, 274] -0.014268
23         average_montly_hours_bin_(274, 287] -0.150833
24         average_montly_hours_bin_(287, 310]  2.350234
25         satisfaction_level_bin_(0.00, 0.11]  4.765910
26         satisfaction_level_bin_(0.11, 0.35] -1.400822
27         satisfaction_level_bin_(0.35, 0.46]  1.637669
28         satisfaction_level_bin_(0.46, 0.71] -1.633800
29         satisfaction_level_bin_(0.71, 0.92]  0.169115
30         satisfaction_level_bin_(0.92, 1.00] -3.538922
31            last_evaluation_bin_(0.00, 0.44] -2.541264
32            last_evaluation_bin_(0.44, 0.57]  1.163123
33            last_evaluation_bin_(0.57, 0.76] -0.196011
34            last_evaluation_bin_(0.76, 1.00]  1.573302
35       time_spend_company_cat_high departure  0.289187
36        time_spend_company_cat_low departure -1.110152
37         time_spend_company_cat_no departure -1.839473
38  time_spend_company_cat_very high departure  2.659588
39                            workload_extreme  2.350234
40                               workload_high  0.104323
41                                workload_low  1.471144
42                             workload_normal -1.650481
43                           workload_very low -2.276070

Cluster by Number of Projects and Last Evaluation

Based on the EDA, the employees can be clustered by Project Performance, based on the Number of Projects and Last Evaluation, into 4 categories.

In [86]:
def project_performance_cluster(row):
    """Label a row with a project-performance cluster.

    Combines the last-evaluation bin and the number of projects into one
    of four categories: 'very low', 'low', 'high' or 'normal'.
    """
    eval_bin = row['last_evaluation_bin']
    n_projects = row['number_project']
    if eval_bin == '(0.00, 0.44]':
        return 'very low'
    if n_projects <= 2 and eval_bin == '(0.44, 0.57]':
        return 'low'
    if n_projects >= 4 and eval_bin == '(0.76, 1.00]':
        return 'high'
    return 'normal'

# Apply the cluster function row-wise; passing the function directly
# avoids the redundant `lambda row: f(row)` wrapper.
hr_fe['project_performance'] = hr_fe.apply(project_performance_cluster, axis=1)
hr_fe.project_performance.value_counts()
Out[86]:
normal      8245
high        4589
low         1720
very low     445
Name: project_performance, dtype: int64
In [87]:
plt.figure(figsize=(15, 5))
# Evaluation vs. projects, coloured by the project-performance cluster.
sns.scatterplot(
    data=hr_fe,
    x='last_evaluation',
    y='number_project',
    hue='project_performance',
    palette=sns.color_palette("hls", 4),
)
plt.tight_layout()
In [88]:
# One-hot encode all engineered categoricals and drop the raw numeric
# columns they replace — one non-inplace drop instead of five
# repetitive inplace ones (idempotent on re-run).
hr_fe_7 = onehot_encode(hr_fe.copy())
hr_fe_7 = hr_fe_7.drop(columns=[
    'satisfaction_level',
    'last_evaluation',
    'average_montly_hours',
    'number_project',
    'time_spend_company',
])
In [89]:
# Evaluate the model with the project-performance cluster added:
# 10-fold CV on the training split, then a full train/test run.
# split_dataset / cv_acc / lr_run are defined earlier in the notebook.
X_fe_7, y_fe_7, X_fe_7_train, X_fe_7_test, y_fe_7_train, y_fe_7_test = split_dataset(hr_fe_7, target, split_ratio, seed)
cv_acc(lr, X_fe_7_train, y_fe_7_train, 10, seed)
print()
lr_run(lr, X_fe_7_train, y_fe_7_train, X_fe_7_test, y_fe_7_test)
10-fold cross validation average accuracy: 0.960

Iteration  1 | Accuracy: 0.96
Iteration  2 | Accuracy: 0.95
Iteration  3 | Accuracy: 0.96
Iteration  4 | Accuracy: 0.96
Iteration  5 | Accuracy: 0.97
Iteration  6 | Accuracy: 0.96
Iteration  7 | Accuracy: 0.96
Iteration  8 | Accuracy: 0.96
Iteration  9 | Accuracy: 0.96
Iteration 10 | Accuracy: 0.95

Accuracy on test: 0.958

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      3435
           1       0.93      0.88      0.91      1065

   micro avg       0.96      0.96      0.96      4500
   macro avg       0.95      0.93      0.94      4500
weighted avg       0.96      0.96      0.96      4500

Confusion Matrix:
[[3368   67]
 [ 123  942]]

                                       Feature     Coef.
0                                   intercept. -0.304227
1                                Work_accident -1.223252
2                        promotion_last_5years -0.510657
3                                       salary -0.639244
4                                department_IT -0.308566
5                             department_RandD -0.427170
6                        department_accounting -0.113093
7                                department_hr  0.396920
8                        department_management -0.146822
9                         department_marketing  0.112515
10                      department_product_mng -0.140261
11                            department_sales  0.023193
12                          department_support  0.369251
13                        department_technical  0.234417
14                  number_project_cat_extreme  3.644086
15                   number_project_cat_normal -1.391861
16                 number_project_cat_too high -1.002857
17                  number_project_cat_too low -1.248984
18           average_montly_hours_bin_(0, 125] -2.150941
19         average_montly_hours_bin_(125, 131]  0.310555
20         average_montly_hours_bin_(131, 161] -0.065836
21         average_montly_hours_bin_(161, 216] -0.782745
22         average_montly_hours_bin_(216, 274]  0.117264
23         average_montly_hours_bin_(274, 287]  0.166471
24         average_montly_hours_bin_(287, 310]  2.405614
25         satisfaction_level_bin_(0.00, 0.11]  4.679780
26         satisfaction_level_bin_(0.11, 0.35] -1.331062
27         satisfaction_level_bin_(0.35, 0.46]  1.205874
28         satisfaction_level_bin_(0.46, 0.71] -1.514709
29         satisfaction_level_bin_(0.71, 0.92]  0.241973
30         satisfaction_level_bin_(0.92, 1.00] -3.281472
31                    project_performance_high  0.246351
32                     project_performance_low  2.100090
33                  project_performance_normal -0.873446
34                project_performance_very low -1.472612
35            last_evaluation_bin_(0.00, 0.44] -1.472612
36            last_evaluation_bin_(0.44, 0.57]  0.498097
37            last_evaluation_bin_(0.57, 0.76]  0.165295
38            last_evaluation_bin_(0.76, 1.00]  0.809603
39       time_spend_company_cat_high departure  0.299676
40        time_spend_company_cat_low departure -1.065227
41         time_spend_company_cat_no departure -1.923445
42  time_spend_company_cat_very high departure  2.689379
43                            workload_extreme  2.405614
44                               workload_high -0.219293
45                                workload_low  1.266443
46                             workload_normal -1.301441
47                           workload_very low -2.150941

Cluster by Last Evaluation and Average Monthly Hours

Based on the EDA, the employees can be clustered by Efficiency, based on the Last Evaluation and the Average Monthly Hours, into 4 categories.

In [90]:
def efficiency_cluster(row):
    """Label a row with an efficiency cluster.

    Combines the last-evaluation bin and the monthly-hours bin into one of
    four categories: 'very low', 'low', 'high' or 'normal'. The lowest
    evaluation bin or lowest hours bin alone is enough for 'very low'.
    """
    eval_bin = row['last_evaluation_bin']
    hours_bin = row['average_montly_hours_bin']
    if eval_bin == '(0.00, 0.44]' or hours_bin == '(0, 125]':
        return 'very low'
    if eval_bin == '(0.44, 0.57]' and hours_bin in ('(125, 131]', '(131, 161]'):
        return 'low'
    if eval_bin == '(0.76, 1.00]' and hours_bin in ('(216, 274]', '(274, 287]', '(287, 310]'):
        return 'high'
    return 'normal'

# Apply the cluster function row-wise; passing the function directly
# avoids the redundant `lambda row: f(row)` wrapper.
hr_fe['efficiency'] = hr_fe.apply(efficiency_cluster, axis=1)
hr_fe.efficiency.value_counts()
Out[90]:
normal      8436
high        3719
low         1994
very low     850
Name: efficiency, dtype: int64
In [91]:
plt.figure(figsize=(15, 5))
# Hours vs. evaluation, coloured by the efficiency cluster.
sns.scatterplot(
    data=hr_fe,
    x='average_montly_hours',
    y='last_evaluation',
    hue='efficiency',
    palette=sns.color_palette("hls", 4),
)
plt.tight_layout()
In [92]:
# One-hot encode all engineered categoricals and drop the raw numeric
# columns they replace — one non-inplace drop instead of five
# repetitive inplace ones (idempotent on re-run).
hr_fe_8 = onehot_encode(hr_fe.copy())
hr_fe_8 = hr_fe_8.drop(columns=[
    'satisfaction_level',
    'last_evaluation',
    'average_montly_hours',
    'number_project',
    'time_spend_company',
])
In [93]:
# Evaluate the model with the efficiency cluster added:
# 10-fold CV on the training split, then a full train/test run.
# split_dataset / cv_acc / lr_run are defined earlier in the notebook.
X_fe_8, y_fe_8, X_fe_8_train, X_fe_8_test, y_fe_8_train, y_fe_8_test = split_dataset(hr_fe_8, target, split_ratio, seed)
cv_acc(lr, X_fe_8_train, y_fe_8_train, 10, seed)
print()
lr_run(lr, X_fe_8_train, y_fe_8_train, X_fe_8_test, y_fe_8_test)
10-fold cross validation average accuracy: 0.960

Iteration  1 | Accuracy: 0.96
Iteration  2 | Accuracy: 0.95
Iteration  3 | Accuracy: 0.96
Iteration  4 | Accuracy: 0.96
Iteration  5 | Accuracy: 0.97
Iteration  6 | Accuracy: 0.96
Iteration  7 | Accuracy: 0.96
Iteration  8 | Accuracy: 0.96
Iteration  9 | Accuracy: 0.96
Iteration 10 | Accuracy: 0.95

Accuracy on test: 0.960

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      3435
           1       0.94      0.88      0.91      1065

   micro avg       0.96      0.96      0.96      4500
   macro avg       0.95      0.93      0.94      4500
weighted avg       0.96      0.96      0.96      4500

Confusion Matrix:
[[3377   58]
 [ 124  941]]

                                       Feature     Coef.
0                                   intercept.  0.110312
1                                Work_accident -1.234954
2                        promotion_last_5years -0.581323
3                                       salary -0.653274
4                                department_IT -0.319980
5                             department_RandD -0.444509
6                        department_accounting -0.118532
7                                department_hr  0.420489
8                        department_management -0.156571
9                         department_marketing  0.097993
10                      department_product_mng -0.141090
11                            department_sales  0.034649
12                          department_support  0.373495
13                        department_technical  0.253988
14                  number_project_cat_extreme  3.511605
15                   number_project_cat_normal -1.541114
16                 number_project_cat_too high -1.025555
17                  number_project_cat_too low -0.945003
18           average_montly_hours_bin_(0, 125] -1.341976
19         average_montly_hours_bin_(125, 131]  0.122886
20         average_montly_hours_bin_(131, 161] -0.304675
21         average_montly_hours_bin_(161, 216] -0.571683
22         average_montly_hours_bin_(216, 274] -0.235870
23         average_montly_hours_bin_(274, 287] -0.046850
24         average_montly_hours_bin_(287, 310]  2.378101
25         satisfaction_level_bin_(0.00, 0.11]  4.547034
26         satisfaction_level_bin_(0.11, 0.35] -1.297088
27         satisfaction_level_bin_(0.35, 0.46]  1.129827
28         satisfaction_level_bin_(0.46, 0.71] -1.465960
29         satisfaction_level_bin_(0.71, 0.92]  0.271559
30         satisfaction_level_bin_(0.92, 1.00] -3.185440
31                    project_performance_high  0.179345
32                     project_performance_low  1.434834
33                  project_performance_normal -0.824305
34                project_performance_very low -0.789941
35            last_evaluation_bin_(0.00, 0.44] -0.789941
36            last_evaluation_bin_(0.44, 0.57]  0.075586
37            last_evaluation_bin_(0.57, 0.76]  0.324766
38            last_evaluation_bin_(0.76, 1.00]  0.389521
39                             efficiency_high  0.730402
40                              efficiency_low  1.642109
41                           efficiency_normal -0.284602
42                         efficiency_very low -2.087977
43       time_spend_company_cat_high departure  0.291585
44        time_spend_company_cat_low departure -1.079268
45         time_spend_company_cat_no departure -1.877955
46  time_spend_company_cat_very high departure  2.665570
47                            workload_extreme  2.378101
48                               workload_high -0.310824
49                                workload_low  0.498769
50                             workload_normal -1.224138
51                           workload_very low -1.341976

Cluster by Last Evaluation and Satisfaction Level

Based on the EDA, the employees can be clustered by Attitude, based on the Last Evaluation and the Satisfaction Level, into 7 categories.

In [94]:
def attitude_cluster(row):
    """Assign an employee to one of 7 attitude clusters.

    Uses the binned last evaluation and satisfaction level produced
    earlier in the notebook. Rules are checked in priority order; the
    first matching rule wins, with 'normal' as the fallback.
    """
    evaluation = row['last_evaluation_bin']
    satisfaction = row['satisfaction_level_bin']
    if evaluation == '(0.00, 0.44]':
        return 'low performance'
    if satisfaction == '(0.92, 1.00]':
        return 'very happy'
    if evaluation == '(0.76, 1.00]' and satisfaction == '(0.71, 0.92]':
        return 'happy and high performance'
    if evaluation == '(0.44, 0.57]' and satisfaction == '(0.35, 0.46]':
        return 'unhappy and low performance'
    if satisfaction == '(0.00, 0.11]':
        return 'very unhappy'
    if satisfaction in ('(0.11, 0.35]', '(0.35, 0.46]'):
        return 'unhappy'
    return 'normal'

# Tag every employee with an attitude cluster and inspect the class sizes.
# Passing the function directly avoids the redundant lambda wrapper.
hr_fe['attitude'] = hr_fe.apply(attitude_cluster, axis=1)
hr_fe['attitude'].value_counts()
Out[94]:
normal                         6668
happy and high performance     2553
unhappy and low performance    1635
unhappy                        1474
very happy                     1336
very unhappy                    888
low performance                 445
Name: attitude, dtype: int64
In [95]:
# Visualize the 7 attitude clusters in satisfaction/evaluation space.
plt.figure(figsize=(15, 5))
sns.scatterplot(x=hr_fe.satisfaction_level,
                y=hr_fe.last_evaluation,
                hue=hr_fe.attitude,
                palette=sns.color_palette("hls", 7))
plt.tight_layout()
In [96]:
# Build dataset 9: one-hot encode the engineered categoricals, then drop
# the raw numeric columns that have been binned/categorized above.
hr_fe_9 = hr_fe.copy()
hr_fe_9 = onehot_encode(hr_fe_9)
# One drop call replaces five consecutive inplace drops: a single pass,
# no inplace mutation, and the cell stays idempotent on re-run.
hr_fe_9 = hr_fe_9.drop(columns=['satisfaction_level',
                                'last_evaluation',
                                'average_montly_hours',
                                'number_project',
                                'time_spend_company'])
In [97]:
# Split dataset 9 into train/test, then cross-validate and fit/report the
# logistic regression (helpers defined earlier in the notebook).
X_fe_9, y_fe_9, X_fe_9_train, X_fe_9_test, y_fe_9_train, y_fe_9_test = split_dataset(hr_fe_9, target, split_ratio, seed)
cv_acc(lr, X_fe_9_train, y_fe_9_train, 10, seed)
print()
lr_run(lr, X_fe_9_train, y_fe_9_train, X_fe_9_test, y_fe_9_test)
10-fold cross validation average accuracy: 0.964

Iteration  1 | Accuracy: 0.96
Iteration  2 | Accuracy: 0.95
Iteration  3 | Accuracy: 0.96
Iteration  4 | Accuracy: 0.97
Iteration  5 | Accuracy: 0.97
Iteration  6 | Accuracy: 0.97
Iteration  7 | Accuracy: 0.97
Iteration  8 | Accuracy: 0.96
Iteration  9 | Accuracy: 0.96
Iteration 10 | Accuracy: 0.96

Accuracy on test: 0.964

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      3435
           1       0.94      0.90      0.92      1065

   micro avg       0.96      0.96      0.96      4500
   macro avg       0.96      0.94      0.95      4500
weighted avg       0.96      0.96      0.96      4500

Confusion Matrix:
[[3379   56]
 [ 108  957]]

                                       Feature     Coef.
0                                   intercept.  0.155601
1                                Work_accident -1.143174
2                        promotion_last_5years -0.597843
3                                       salary -0.652169
4                                department_IT -0.355823
5                             department_RandD -0.441449
6                        department_accounting -0.095917
7                                department_hr  0.447624
8                        department_management -0.163427
9                         department_marketing  0.093569
10                      department_product_mng -0.171977
11                            department_sales  0.034884
12                          department_support  0.366134
13                        department_technical  0.288471
14                  number_project_cat_extreme  3.202322
15                   number_project_cat_normal -1.490338
16                 number_project_cat_too high -0.978540
17                  number_project_cat_too low -0.731354
18           average_montly_hours_bin_(0, 125] -1.380806
19         average_montly_hours_bin_(125, 131]  0.087262
20         average_montly_hours_bin_(131, 161] -0.283742
21         average_montly_hours_bin_(161, 216] -0.697097
22         average_montly_hours_bin_(216, 274] -0.167288
23         average_montly_hours_bin_(274, 287]  0.161469
24         average_montly_hours_bin_(287, 310]  2.282292
25         satisfaction_level_bin_(0.00, 0.11]  2.651998
26         satisfaction_level_bin_(0.11, 0.35] -0.522950
27         satisfaction_level_bin_(0.35, 0.46]  0.552467
28         satisfaction_level_bin_(0.46, 0.71] -0.530850
29         satisfaction_level_bin_(0.71, 0.92] -0.200445
30         satisfaction_level_bin_(0.92, 1.00] -1.948131
31                    project_performance_high  0.419734
32                     project_performance_low  0.855019
33                  project_performance_normal -0.579259
34                project_performance_very low -0.693405
35            last_evaluation_bin_(0.00, 0.44] -0.693405
36            last_evaluation_bin_(0.44, 0.57]  0.121621
37            last_evaluation_bin_(0.57, 0.76]  0.748939
38            last_evaluation_bin_(0.76, 1.00] -0.175065
39                             efficiency_high  0.631356
40                              efficiency_low  1.559049
41                           efficiency_normal -0.140877
42                         efficiency_very low -2.047439
43       time_spend_company_cat_high departure  0.258945
44        time_spend_company_cat_low departure -1.051914
45         time_spend_company_cat_no departure -1.793281
46  time_spend_company_cat_very high departure  2.588338
47         attitude_happy and high performance  0.807235
48                    attitude_low performance -0.693405
49                             attitude_normal -1.154902
50                            attitude_unhappy -1.048763
51        attitude_unhappy and low performance  1.378160
52                         attitude_very happy -1.938233
53                       attitude_very unhappy  2.651998
54                            workload_extreme  2.282292
55                               workload_high -0.227062
56                                workload_low  0.383963
57                             workload_normal -1.056297
58                           workload_very low -1.380806

Removing Unbinned Variables and Encoding New Features

The variables which have been binned are removed from the dataset, and new features are one hot encoded.

In [98]:
# Final feature-engineered dataset: one-hot encode, then remove the numeric
# originals that have been binned, and summarize the resulting columns.
hr_fe_encoded = onehot_encode(hr_fe)
# Single drop(columns=...) call instead of five inplace drops — one pass,
# no inplace mutation, idempotent on re-run.
hr_fe_encoded = hr_fe_encoded.drop(columns=['satisfaction_level',
                                            'last_evaluation',
                                            'average_montly_hours',
                                            'number_project',
                                            'time_spend_company'])
df_desc(hr_fe_encoded)
Out[98]:
dtype NAs Numerical Boolean Categorical
Work_accident int64 0 False True False
left int64 0 False True False
promotion_last_5years int64 0 False True False
salary int64 0 True False False
department_IT uint8 0 False True False
department_RandD uint8 0 False True False
department_accounting uint8 0 False True False
department_hr uint8 0 False True False
department_management uint8 0 False True False
department_marketing uint8 0 False True False
department_product_mng uint8 0 False True False
department_sales uint8 0 False True False
department_support uint8 0 False True False
department_technical uint8 0 False True False
number_project_cat_extreme uint8 0 False True False
number_project_cat_normal uint8 0 False True False
number_project_cat_too high uint8 0 False True False
number_project_cat_too low uint8 0 False True False
average_montly_hours_bin_(0, 125] uint8 0 False True False
average_montly_hours_bin_(125, 131] uint8 0 False True False
average_montly_hours_bin_(131, 161] uint8 0 False True False
average_montly_hours_bin_(161, 216] uint8 0 False True False
average_montly_hours_bin_(216, 274] uint8 0 False True False
average_montly_hours_bin_(274, 287] uint8 0 False True False
average_montly_hours_bin_(287, 310] uint8 0 False True False
satisfaction_level_bin_(0.00, 0.11] uint8 0 False True False
satisfaction_level_bin_(0.11, 0.35] uint8 0 False True False
satisfaction_level_bin_(0.35, 0.46] uint8 0 False True False
satisfaction_level_bin_(0.46, 0.71] uint8 0 False True False
satisfaction_level_bin_(0.71, 0.92] uint8 0 False True False
satisfaction_level_bin_(0.92, 1.00] uint8 0 False True False
project_performance_high uint8 0 False True False
project_performance_low uint8 0 False True False
project_performance_normal uint8 0 False True False
project_performance_very low uint8 0 False True False
last_evaluation_bin_(0.00, 0.44] uint8 0 False True False
last_evaluation_bin_(0.44, 0.57] uint8 0 False True False
last_evaluation_bin_(0.57, 0.76] uint8 0 False True False
last_evaluation_bin_(0.76, 1.00] uint8 0 False True False
efficiency_high uint8 0 False True False
efficiency_low uint8 0 False True False
efficiency_normal uint8 0 False True False
efficiency_very low uint8 0 False True False
time_spend_company_cat_high departure uint8 0 False True False
time_spend_company_cat_low departure uint8 0 False True False
time_spend_company_cat_no departure uint8 0 False True False
time_spend_company_cat_very high departure uint8 0 False True False
attitude_happy and high performance uint8 0 False True False
attitude_low performance uint8 0 False True False
attitude_normal uint8 0 False True False
attitude_unhappy uint8 0 False True False
attitude_unhappy and low performance uint8 0 False True False
attitude_very happy uint8 0 False True False
attitude_very unhappy uint8 0 False True False
workload_extreme uint8 0 False True False
workload_high uint8 0 False True False
workload_low uint8 0 False True False
workload_normal uint8 0 False True False
workload_very low uint8 0 False True False

Features Selection

The dataset resulting from the Feature Engineering phase contains 58 features, with a model reaching the accuracy of 0.964. The Feature Selection phase aims to reduce the number of variables used by the model.

In [99]:
# Baseline for feature selection: train/test split, 10-fold CV and a full
# fit/report on the complete encoded feature set (58 features).
X_fe_encoded, y_fe_encoded, X_fe_encoded_train, X_fe_encoded_test, y_fe_encoded_train, y_fe_encoded_test = split_dataset(hr_fe_encoded, target, split_ratio, seed)
cv_acc(lr, X_fe_encoded_train, y_fe_encoded_train, 10, seed)
print()
lr_run(lr, X_fe_encoded_train, y_fe_encoded_train, X_fe_encoded_test, y_fe_encoded_test)
10-fold cross validation average accuracy: 0.964

Iteration  1 | Accuracy: 0.96
Iteration  2 | Accuracy: 0.95
Iteration  3 | Accuracy: 0.96
Iteration  4 | Accuracy: 0.97
Iteration  5 | Accuracy: 0.97
Iteration  6 | Accuracy: 0.97
Iteration  7 | Accuracy: 0.97
Iteration  8 | Accuracy: 0.96
Iteration  9 | Accuracy: 0.96
Iteration 10 | Accuracy: 0.96

Accuracy on test: 0.964

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      3435
           1       0.94      0.90      0.92      1065

   micro avg       0.96      0.96      0.96      4500
   macro avg       0.96      0.94      0.95      4500
weighted avg       0.96      0.96      0.96      4500

Confusion Matrix:
[[3379   56]
 [ 108  957]]

                                       Feature     Coef.
0                                   intercept.  0.155601
1                                Work_accident -1.143174
2                        promotion_last_5years -0.597843
3                                       salary -0.652169
4                                department_IT -0.355823
5                             department_RandD -0.441449
6                        department_accounting -0.095917
7                                department_hr  0.447624
8                        department_management -0.163427
9                         department_marketing  0.093569
10                      department_product_mng -0.171977
11                            department_sales  0.034884
12                          department_support  0.366134
13                        department_technical  0.288471
14                  number_project_cat_extreme  3.202322
15                   number_project_cat_normal -1.490338
16                 number_project_cat_too high -0.978540
17                  number_project_cat_too low -0.731354
18           average_montly_hours_bin_(0, 125] -1.380806
19         average_montly_hours_bin_(125, 131]  0.087262
20         average_montly_hours_bin_(131, 161] -0.283742
21         average_montly_hours_bin_(161, 216] -0.697097
22         average_montly_hours_bin_(216, 274] -0.167288
23         average_montly_hours_bin_(274, 287]  0.161469
24         average_montly_hours_bin_(287, 310]  2.282292
25         satisfaction_level_bin_(0.00, 0.11]  2.651998
26         satisfaction_level_bin_(0.11, 0.35] -0.522950
27         satisfaction_level_bin_(0.35, 0.46]  0.552467
28         satisfaction_level_bin_(0.46, 0.71] -0.530850
29         satisfaction_level_bin_(0.71, 0.92] -0.200445
30         satisfaction_level_bin_(0.92, 1.00] -1.948131
31                    project_performance_high  0.419734
32                     project_performance_low  0.855019
33                  project_performance_normal -0.579259
34                project_performance_very low -0.693405
35            last_evaluation_bin_(0.00, 0.44] -0.693405
36            last_evaluation_bin_(0.44, 0.57]  0.121621
37            last_evaluation_bin_(0.57, 0.76]  0.748939
38            last_evaluation_bin_(0.76, 1.00] -0.175065
39                             efficiency_high  0.631356
40                              efficiency_low  1.559049
41                           efficiency_normal -0.140877
42                         efficiency_very low -2.047439
43       time_spend_company_cat_high departure  0.258945
44        time_spend_company_cat_low departure -1.051914
45         time_spend_company_cat_no departure -1.793281
46  time_spend_company_cat_very high departure  2.588338
47         attitude_happy and high performance  0.807235
48                    attitude_low performance -0.693405
49                             attitude_normal -1.154902
50                            attitude_unhappy -1.048763
51        attitude_unhappy and low performance  1.378160
52                         attitude_very happy -1.938233
53                       attitude_very unhappy  2.651998
54                            workload_extreme  2.282292
55                               workload_high -0.227062
56                                workload_low  0.383963
57                             workload_normal -1.056297
58                           workload_very low -1.380806
In [100]:
plot_roc(lr, X_fe_encoded_test, y_fe_encoded_test)
In [101]:
# Recursive Feature Elimination sweep: for each target feature count i,
# select the best i features and record the test accuracy of a logistic
# regression trained on them.
accuracies = pd.DataFrame(columns=['features','accuracy', 'cols'])
print('Iterations:')

for i in range(1, len(X_fe_encoded.columns)+1):
    logreg = LogisticRegression(solver='lbfgs', max_iter=250)
    # Pass n_features_to_select as a keyword: the positional second
    # argument is deprecated/removed in recent scikit-learn releases.
    rfe = RFE(logreg, n_features_to_select=i)
    # NOTE(review): RFE is fitted on the FULL dataset (train + test), so the
    # feature ranking leaks test information; consider fitting on the
    # training split only.
    rfe = rfe.fit(X_fe_encoded, y_fe_encoded.values.ravel())

    cols_rfe = list(X_fe_encoded.loc[:, rfe.support_])
    X_rfe_sel = X_fe_encoded_train[cols_rfe]
    X_rfe_test_sel = X_fe_encoded_test[cols_rfe]

    # Refit on the training split restricted to the selected columns.
    result = logreg.fit(X_rfe_sel, y_fe_encoded_train.values.ravel())
    acc_test = logreg.score(X_rfe_test_sel, y_fe_encoded_test)

    accuracies.loc[i] = [i, acc_test, cols_rfe]
    print(i, end='   ')
Iterations:
1   2   3   4   5   6   7   8   9   10   11   12   13   14   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48   49   50   51   52   53   54   55   56   57   58   
In [102]:
# Test accuracy as a function of the number of RFE-selected features.
plt.figure(figsize=(15, 5))
sns.lineplot(x=accuracies['features'],
             y=accuracies['accuracy'],
             color='steelblue')
plt.tight_layout()
In [103]:
accuracies.nlargest(10, 'accuracy')
Out[103]:
features accuracy cols
14 14 0.967111 [number_project_cat_extreme, average_montly_ho...
18 18 0.966889 [number_project_cat_extreme, average_montly_ho...
19 19 0.966667 [number_project_cat_extreme, average_montly_ho...
20 20 0.966667 [Work_accident, number_project_cat_extreme, av...
15 15 0.966444 [number_project_cat_extreme, average_montly_ho...
16 16 0.966444 [number_project_cat_extreme, average_montly_ho...
17 17 0.966444 [number_project_cat_extreme, average_montly_ho...
22 22 0.965556 [Work_accident, number_project_cat_extreme, nu...
21 21 0.965333 [Work_accident, number_project_cat_extreme, av...
29 29 0.964889 [Work_accident, promotion_last_5years, number_...
In [104]:
# Re-fit RFE with the feature count that scored best in the sweep above
# and list the selected feature names.
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

# All candidate predictors: every column except the target.
features_rfe = list(hr_fe_encoded)
features_rfe.remove(target)

X_rfe = hr_fe_encoded.loc[:, features_rfe]
y_rfe = hr_fe_encoded.loc[:, target]

# The 'features' column lives in an object-dtype frame (it also holds
# lists), so cast the winning count to a plain int before passing it on.
best_n = int(accuracies.nlargest(1, 'accuracy').features.values.ravel()[0])

logreg = LogisticRegression(solver='lbfgs', max_iter=250)
# Keyword n_features_to_select: the positional second argument is
# deprecated/removed in recent scikit-learn releases.
rfe = RFE(logreg, n_features_to_select=best_n)
rfe = rfe.fit(X_rfe, y_rfe)

print(sum(rfe.support_),'selected features:')
for i in list(X_rfe.loc[:, rfe.support_]):
    print(i)
14 selected features:
number_project_cat_extreme
average_montly_hours_bin_(287, 310]
satisfaction_level_bin_(0.00, 0.11]
satisfaction_level_bin_(0.92, 1.00]
efficiency_low
efficiency_very low
time_spend_company_cat_no departure
time_spend_company_cat_very high departure
attitude_normal
attitude_unhappy
attitude_very happy
attitude_very unhappy
workload_extreme
workload_normal
In [105]:
# Load the promotion train/test datasets.
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
train = pd.read_csv('C:\\Users\\RIA SHARMA\\Desktop\\data\\data\\train.csv')
test = pd.read_csv('C:\\Users\\RIA SHARMA\\Desktop\\data\\data\\test.csv')
# getting their shapes
print("Shape of train :", train.shape)
print("Shape of test :", test.shape)
Shape of train : (54808, 14)
Shape of test : (23490, 13)
In [106]:
train.head()
Out[106]:
employee_id department region education gender recruitment_channel no_of_trainings age previous_year_rating length_of_service KPIs_met >80% awards_won? avg_training_score is_promoted
0 65438 Sales & Marketing region_7 Master's & above f sourcing 1 35 5.0 8 1 0 49 0
1 65141 Operations region_22 Bachelor's m other 1 30 5.0 4 0 0 60 0
2 7513 Sales & Marketing region_19 Bachelor's m sourcing 1 34 3.0 7 0 0 50 0
3 2542 Sales & Marketing region_23 Bachelor's m other 2 39 1.0 10 0 0 50 0
4 48945 Technology region_26 Bachelor's m other 1 45 3.0 2 0 0 73 0
In [107]:
test.head()
Out[107]:
employee_id department region education gender recruitment_channel no_of_trainings age previous_year_rating length_of_service KPIs_met >80% awards_won? avg_training_score
0 8724 Technology region_26 Bachelor's m sourcing 1 24 NaN 1 1 0 77
1 74430 HR region_4 Bachelor's f other 1 31 3.0 5 0 0 51
2 72255 Sales & Marketing region_13 Bachelor's m other 1 31 1.0 4 0 0 47
3 38562 Procurement region_2 Bachelor's f other 3 31 2.0 9 0 0 65
4 64486 Finance region_29 Bachelor's m sourcing 1 30 4.0 7 0 0 61
In [108]:
# Summary statistics for every training column; include='all' also covers
# the categorical columns (count/unique/top/freq).

train.describe(include = 'all')
Out[108]:
employee_id department region education gender recruitment_channel no_of_trainings age previous_year_rating length_of_service KPIs_met >80% awards_won? avg_training_score is_promoted
count 54808.000000 54808 54808 52399 54808 54808 54808.000000 54808.000000 50684.000000 54808.000000 54808.000000 54808.000000 54808.000000 54808.000000
unique NaN 9 34 3 2 3 NaN NaN NaN NaN NaN NaN NaN NaN
top NaN Sales & Marketing region_2 Bachelor's m other NaN NaN NaN NaN NaN NaN NaN NaN
freq NaN 16840 12343 36669 38496 30446 NaN NaN NaN NaN NaN NaN NaN NaN
mean 39195.830627 NaN NaN NaN NaN NaN 1.253011 34.803915 3.329256 5.865512 0.351974 0.023172 63.386750 0.085170
std 22586.581449 NaN NaN NaN NaN NaN 0.609264 7.660169 1.259993 4.265094 0.477590 0.150450 13.371559 0.279137
min 1.000000 NaN NaN NaN NaN NaN 1.000000 20.000000 1.000000 1.000000 0.000000 0.000000 39.000000 0.000000
25% 19669.750000 NaN NaN NaN NaN NaN 1.000000 29.000000 3.000000 3.000000 0.000000 0.000000 51.000000 0.000000
50% 39225.500000 NaN NaN NaN NaN NaN 1.000000 33.000000 3.000000 5.000000 0.000000 0.000000 60.000000 0.000000
75% 58730.500000 NaN NaN NaN NaN NaN 1.000000 39.000000 4.000000 7.000000 1.000000 0.000000 76.000000 0.000000
max 78298.000000 NaN NaN NaN NaN NaN 10.000000 60.000000 5.000000 37.000000 1.000000 1.000000 99.000000 1.000000
In [109]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
employee_id             54808 non-null int64
department              54808 non-null object
region                  54808 non-null object
education               52399 non-null object
gender                  54808 non-null object
recruitment_channel     54808 non-null object
no_of_trainings         54808 non-null int64
age                     54808 non-null int64
previous_year_rating    50684 non-null float64
length_of_service       54808 non-null int64
KPIs_met >80%           54808 non-null int64
awards_won?             54808 non-null int64
avg_training_score      54808 non-null int64
is_promoted             54808 non-null int64
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB
In [110]:
# Which training columns contain at least one missing value
# (per the output: education and previous_year_rating).

train.isnull().any()
Out[110]:
employee_id             False
department              False
region                  False
education                True
gender                  False
recruitment_channel     False
no_of_trainings         False
age                     False
previous_year_rating     True
length_of_service       False
KPIs_met >80%           False
awards_won?             False
avg_training_score      False
is_promoted             False
dtype: bool
In [111]:
test.isnull().sum()
Out[111]:
employee_id                0
department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64
In [112]:
# checking the no. of Employees promoted (class balance of the target)

train['is_promoted'].value_counts()
Out[112]:
0    50140
1     4668
Name: is_promoted, dtype: int64
In [113]:
# Percentage of promoted employees, computed from the data instead of the
# hard-coded counts (4668/54808) so the cell stays correct if the dataset
# changes. is_promoted is 0/1, so its mean is the promotion rate.
promoted = train['is_promoted'].mean() * 100
print("Percentage of Promoted Employees is {:.2f}%".format(promoted))
Percentage of Promoted Employees is 8.52%
In [114]:
# plotting a histogram of the 0/1 target (the original comment said
# "scatter plot"; this is a histogram of is_promoted)

plt.hist(train['is_promoted'])
plt.title('plot to show the gap in Promoted and Non-Promoted Employees', fontsize = 30)
plt.xlabel('0 -No Promotion and 1- Promotion', fontsize = 20)
plt.ylabel('count')
plt.show()
In [115]:
# checking the distribution of the avg_training score of the Employees
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; consider
# sns.histplot(..., kde=True) when upgrading.

plt.rcParams['figure.figsize'] = (15, 7)
sns.distplot(train['avg_training_score'], color = 'blue')
plt.title('Distribution of Training Score among the Employees', fontsize = 30)
plt.xlabel('Average Training Score', fontsize = 20)
plt.ylabel('count')
plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
In [116]:
train['awards_won?'].value_counts()
Out[116]:
0    53538
1     1270
Name: awards_won?, dtype: int64
In [117]:
# Donut chart of award winners vs non-winners.
# Bug fix: size comes from value_counts() above — 53538 employees did NOT
# win an award and 1270 did — so the labels must follow that order; the
# original labels were swapped. (Comment also wrongly said "recruitment
# channel"; this chart is about awards.)
size = [53538, 1270]
colors = ['magenta', 'brown']
labels = "NO Awards Won", "Awards Won"

# White circle drawn over the pie center turns it into a donut.
my_circle = plt.Circle((0, 0), 0.7, color = 'white')

plt.rcParams['figure.figsize'] = (9, 9)
plt.pie(size, colors = colors, labels = labels, shadow = True, autopct = '%.2f%%')
plt.title('Showing a Percentage of employees who won awards', fontsize = 30)
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.legend()
plt.show()
In [118]:
train['KPIs_met >80%'].value_counts()
Out[118]:
0    35517
1    19291
Name: KPIs_met >80%, dtype: int64
In [119]:
# Exploded pie chart: share of employees who met more than 80% of their KPIs.
labels = "Not Met KPI > 80%", "Met KPI > 80%"
colors = ['violet', 'grey']
explode = [0, 0.1]  # pull the "met" slice out slightly for emphasis
size = [35517, 19291]  # counts from value_counts(): 0 -> not met, 1 -> met

plt.rcParams['figure.figsize'] = (8, 8)
plt.pie(size, labels=labels, colors=colors, explode=explode, shadow=True, autopct="%.2f%%")
plt.title('A Pie Chart Representing Gap in Employees in terms of KPI', fontsize=30)
plt.axis('off')
plt.legend()
plt.show()
In [120]:
# checking the distribution of length of service (in years) among employees

sns.distplot(train['length_of_service'], color = 'green')
plt.title('Distribution of length of service among the Employees', fontsize = 30)
plt.xlabel('Length of Service in years', fontsize = 15)
plt.ylabel('count')
plt.show()
In [121]:
# Bar plot of previous-year rating frequencies.
# NOTE(review): .sort_values() orders the bars by COUNT, not by rating;
# if the x-axis should read 1..5 in order, use .sort_index() — confirm intent.
train['previous_year_rating'].value_counts().sort_values().plot.bar(color = 'violet', figsize = (15, 7))
plt.title('Distribution of Previous year rating of the Employees', fontsize = 30)
plt.xlabel('Ratings', fontsize = 15)
plt.ylabel('count')
plt.show()
In [122]:
# checking the distribution of age of Employees in the company

sns.distplot(train['age'], color = 'red')
plt.title('Distribution of Age of Employees', fontsize = 30)
plt.xlabel('Age', fontsize = 15)
plt.ylabel('count')
plt.show()
In [123]:
# checking the different no. of trainings done by the employees
# NOTE(review): positional Series argument to sns.violinplot is deprecated
# in newer seaborn; pass x=... explicitly when upgrading.

plt.rcParams['figure.figsize'] = (17, 7)
sns.violinplot(train['no_of_trainings'], color = 'purple')
plt.title('No. of trainings done by the Employees', fontsize = 30)
plt.xlabel('No. of Trainings', fontsize = 15)
plt.ylabel('Frequency')
plt.show()
In [124]:
# checking the different types of recruitment channels for the company

train['recruitment_channel'].value_counts()
Out[124]:
other       30446
sourcing    23220
referred     1142
Name: recruitment_channel, dtype: int64
In [125]:
# Donut chart of each recruitment channel's share.
# Sizes follow value_counts() order: other (30446), sourcing (23220),
# referred (1142). Typo fix in the label: "Reffered" -> "Referred".
size = [30446, 23220, 1142]
colors = ['yellow', 'red', 'lightgreen']
labels = "Others", "Sourcing", "Referred"

# White circle drawn over the pie center turns it into a donut.
my_circle = plt.Circle((0, 0), 0.7, color = 'white')

plt.rcParams['figure.figsize'] = (9, 9)
plt.pie(size, colors = colors, labels = labels, shadow = True, autopct = '%.2f%%')
plt.title('Showing share of different Recruitment Channels', fontsize = 30)
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.legend()
plt.show()
In [126]:
# checking the gender gap (counts of male vs female employees)

train['gender'].value_counts()
Out[126]:
m    38496
f    16312
Name: gender, dtype: int64
In [127]:
# Exploded pie chart of the gender split in the workforce.
labels = "Male", "Female"
colors = ['yellow', 'orange']
explode = [0, 0.1]  # pull the female slice out slightly for emphasis
size = [38496, 16312]  # counts from value_counts(): 'm', then 'f'

plt.rcParams['figure.figsize'] = (8, 8)
plt.pie(size, labels=labels, colors=colors, explode=explode, shadow=True, autopct="%.2f%%")
plt.title('A Pie Chart Representing GenderGap', fontsize=30)
plt.axis('off')
plt.legend()
plt.show()
In [128]:
# checking the different regions of the company (employee count per region)

plt.rcParams['figure.figsize'] = (20, 10)
sns.countplot(train['region'], color = 'pink')
plt.title('Different Regions in the company', fontsize = 30)
plt.xticks(rotation = 60)
plt.xlabel('Region Code', fontsize = 15)
plt.ylabel('count', fontsize = 15)
plt.show()

Bi-variate Data Visualization

In [129]:
# Row-normalized stacked bar chart (not a scatter plot, as the original
# comment said): promotion share for each average training score value.

data = pd.crosstab(train['avg_training_score'], train['is_promoted'])
data.div(data.sum(1).astype(float), axis = 0).plot(kind = 'bar', stacked = True, figsize = (20, 9), color = ['darkred', 'lightgreen'])

plt.title('Looking at the Dependency of Training Score in promotion', fontsize = 30)
plt.xlabel('Average Training Scores', fontsize = 15)
plt.legend()
plt.show()
In [130]:
# Promotion rate by region: crosstab counts are row-normalized so each
# stacked bar shows the promoted/non-promoted share within that region.

data = pd.crosstab(train['region'], train['is_promoted'])
data.div(data.sum(1).astype('float'), axis = 0).plot(kind = 'bar', stacked = True, figsize = (20, 8), color = ['lightblue', 'purple'])

plt.title('Dependency of Regions in determining Promotion of Employees', fontsize = 30)
plt.xlabel('Different Regions of the Company', fontsize = 20)
plt.legend()
plt.show()
In [131]:
# Promotion rate by award status: row-normalized stacked bars show the
# promoted share among award winners vs non-winners.

data = pd.crosstab(train['awards_won?'], train['is_promoted'])
data.div(data.sum(1).astype('float'), axis = 0).plot(kind = 'bar', stacked = True, figsize = (10, 8), color = ['magenta', 'purple'])

plt.title('Dependency of Awards in determining Promotion', fontsize = 30)
plt.xlabel('Awards Won or Not', fontsize = 20)
plt.legend()
plt.show()
In [132]:
# promotion share split by whether the employee met more than 80% of KPIs

kpi_counts = pd.crosstab(train['KPIs_met >80%'], train['is_promoted'])
kpi_share = kpi_counts.div(kpi_counts.sum(axis = 1).astype('float'), axis = 0)
kpi_share.plot(kind = 'bar', stacked = True, figsize = (10, 8), color = ['pink', 'darkred'])

plt.title('Dependency of KPIs in determining Promotion', fontsize = 30)
plt.xlabel('KPIs Met or Not', fontsize = 20)
plt.legend()
plt.show()
In [133]:
# promotion share for each previous-year performance rating

rating_counts = pd.crosstab(train['previous_year_rating'], train['is_promoted'])
rating_share = rating_counts.div(rating_counts.sum(axis = 1).astype('float'), axis = 0)
rating_share.plot(kind = 'bar', stacked = True, figsize = (15, 8), color = ['violet', 'pink'])

plt.title('Dependency of Previous year Ratings in determining Promotion', fontsize = 30)
plt.xlabel('Different Ratings', fontsize = 20)
plt.legend()
plt.show()
In [134]:
# promotion share by length of service (tenure) of the employee

tenure_counts = pd.crosstab(train['length_of_service'], train['is_promoted'])
(tenure_counts
 .div(tenure_counts.sum(axis = 1).astype('float'), axis = 0)
 .plot(kind = 'bar', stacked = True, figsize = (20, 8), color = ['pink', 'lightblue']))

plt.title('Dependency of Length of service in Promotions of Employees', fontsize = 30)
plt.xlabel('Length of service of employees', fontsize = 20)
plt.legend()
plt.show()
In [135]:
# promotion share at each employee age

age_counts = pd.crosstab(train['age'], train['is_promoted'])
age_share = age_counts.div(age_counts.sum(axis = 1).astype('float'), axis = 0)
age_share.plot(kind = 'bar', stacked = True, figsize = (20, 8), color = ['lightblue', 'green'])

plt.title('Dependency of Age in determining Promotion of Employees', fontsize = 30)
plt.xlabel('Age of Employees', fontsize = 20)
plt.legend()
plt.show()
In [136]:
# which department has the highest promotion rate?

dept_counts = pd.crosstab(train['department'], train['is_promoted'])
dept_counts.div(dept_counts.sum(axis = 1).astype('float'), axis = 0).plot(
    kind = 'bar', stacked = True, figsize = (20, 8), color = ['orange', 'lightgreen'])

plt.title('Dependency of Departments in determining Promotion of Employees', fontsize = 30)
plt.xlabel('Different Departments of the Company', fontsize = 20)
plt.legend()
plt.show()
In [137]:
# promotion share by gender

gender_counts = pd.crosstab(train['gender'], train['is_promoted'])
gender_share = gender_counts.div(gender_counts.sum(axis = 1).astype('float'), axis = 0)
gender_share.plot(kind = 'bar', stacked = True, figsize = (7, 5), color = ['pink', 'yellow'])

plt.title('Dependency of Genders in determining Promotion of Employees', fontsize = 30)
plt.xlabel('Gender', fontsize = 20)
plt.legend()
plt.show()

Data Pre-processing

In [138]:
# filling missing values

# Chained assignment with inplace=True (df['col'].fillna(..., inplace=True))
# is deprecated in pandas >= 2.1 and silently does nothing under
# copy-on-write; assign the filled series back to the column instead.
train['education'] = train['education'].fillna(train['education'].mode()[0])
# missing ratings are imputed with the lowest rating, 1
train['previous_year_rating'] = train['previous_year_rating'].fillna(1)

# again checking if there is any Null value left in the data
train.isnull().sum().sum()
Out[138]:
0
In [139]:
# filling missing values

# Chained assignment with inplace=True (df['col'].fillna(..., inplace=True))
# is deprecated in pandas >= 2.1 and silently does nothing under
# copy-on-write; assign the filled series back to the column instead.
test['education'] = test['education'].fillna(test['education'].mode()[0])
# missing ratings are imputed with the lowest rating, 1 (same rule as train)
test['previous_year_rating'] = test['previous_year_rating'].fillna(1)

# again checking if there is any Null value left in the data
test.isnull().sum().sum()
Out[139]:
0
In [140]:
# removing the employee_id column — an identifier carries no predictive signal

train = train.drop(columns = ['employee_id'])

train.columns
Out[140]:
Index(['department', 'region', 'education', 'gender', 'recruitment_channel',
       'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score', 'is_promoted'],
      dtype='object')
In [141]:
# keep the identifiers aside so predictions can be mapped back to employees

emp_id = test['employee_id']

# drop the identifier column from the feature set

test = test.drop(columns = ['employee_id'])

test.columns
Out[141]:
Index(['department', 'region', 'education', 'gender', 'recruitment_channel',
       'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score'],
      dtype='object')
In [142]:
# defining the test set

# NOTE(review): this binds x_test to the SAME object as `test` (no copy).
# The pd.get_dummies call in the next cell rebinds x_test, so `test` itself
# is not mutated here — but confirm before adding any in-place edits.
x_test = test

x_test.columns
Out[142]:
Index(['department', 'region', 'education', 'gender', 'recruitment_channel',
       'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score'],
      dtype='object')
In [143]:
# one hot encoding for the test set

# NOTE(review): train and test are dummy-encoded independently; if a category
# appears in only one of them the resulting column sets will diverge.
# Consider aligning the frames (e.g. DataFrame.align with join='left')
# or encoding the combined data before splitting.
x_test = pd.get_dummies(x_test)

x_test.columns
Out[143]:
Index(['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score',
       'department_Analytics', 'department_Finance', 'department_HR',
       'department_Legal', 'department_Operations', 'department_Procurement',
       'department_R&D', 'department_Sales & Marketing',
       'department_Technology', 'region_region_1', 'region_region_10',
       'region_region_11', 'region_region_12', 'region_region_13',
       'region_region_14', 'region_region_15', 'region_region_16',
       'region_region_17', 'region_region_18', 'region_region_19',
       'region_region_2', 'region_region_20', 'region_region_21',
       'region_region_22', 'region_region_23', 'region_region_24',
       'region_region_25', 'region_region_26', 'region_region_27',
       'region_region_28', 'region_region_29', 'region_region_3',
       'region_region_30', 'region_region_31', 'region_region_32',
       'region_region_33', 'region_region_34', 'region_region_4',
       'region_region_5', 'region_region_6', 'region_region_7',
       'region_region_8', 'region_region_9', 'education_Bachelor's',
       'education_Below Secondary', 'education_Master's & above', 'gender_f',
       'gender_m', 'recruitment_channel_other', 'recruitment_channel_referred',
       'recruitment_channel_sourcing'],
      dtype='object')
In [144]:
# splitting the train set into dependent and independent sets
# (the target 'is_promoted' is the last column — see the columns listing above)

x, y = train.iloc[:, :-1], train.iloc[:, -1]

print("Shape of x:", x.shape)
print("Shape of y:", y.shape)
Shape of x: (54808, 12)
Shape of y: (54808,)
In [145]:
# one hot encoding for the train set

# NOTE(review): encoded separately from x_test; verify both frames end up
# with an identical column set and order before fitting a model.
x = pd.get_dummies(x)

x.columns
Out[145]:
Index(['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score',
       'department_Analytics', 'department_Finance', 'department_HR',
       'department_Legal', 'department_Operations', 'department_Procurement',
       'department_R&D', 'department_Sales & Marketing',
       'department_Technology', 'region_region_1', 'region_region_10',
       'region_region_11', 'region_region_12', 'region_region_13',
       'region_region_14', 'region_region_15', 'region_region_16',
       'region_region_17', 'region_region_18', 'region_region_19',
       'region_region_2', 'region_region_20', 'region_region_21',
       'region_region_22', 'region_region_23', 'region_region_24',
       'region_region_25', 'region_region_26', 'region_region_27',
       'region_region_28', 'region_region_29', 'region_region_3',
       'region_region_30', 'region_region_31', 'region_region_32',
       'region_region_33', 'region_region_34', 'region_region_4',
       'region_region_5', 'region_region_6', 'region_region_7',
       'region_region_8', 'region_region_9', 'education_Bachelor's',
       'education_Below Secondary', 'education_Master's & above', 'gender_f',
       'gender_m', 'recruitment_channel_other', 'recruitment_channel_referred',
       'recruitment_channel_sourcing'],
      dtype='object')
In [ ]: